From c9ffe0004e4eb33581e369d94f45e5e2447f2933 Mon Sep 17 00:00:00 2001 From: "James Edwards (xN/A) TX" Date: Sun, 20 Mar 2016 15:39:40 -0500 Subject: [PATCH] Check open source core runtime code into perforce. This includes license and README files. [git-p4: depot-paths = "//depot/stg/hsa/drivers/hsa/runtime/": change = 1249136] --- runtime/hsa-runtime/CMakeLists.txt | 68 + runtime/hsa-runtime/README.md | 166 + .../cmake_modules/COPYING-CMAKE-SCRIPTS | 22 + .../cmake_modules/FindLibElf.cmake | 69 + .../cmake_modules/hsa_common.cmake | 79 + runtime/hsa-runtime/cmake_modules/utils.cmake | 71 + runtime/hsa-runtime/core/CMakeLists.txt | 171 + .../core/common/hsa_table_interface.cpp | 604 +++ runtime/hsa-runtime/core/common/shared.cpp | 48 + runtime/hsa-runtime/core/common/shared.h | 109 + runtime/hsa-runtime/core/hsacore.so.def | 143 + runtime/hsa-runtime/core/inc/agent.h | 264 ++ runtime/hsa-runtime/core/inc/amd_aql_queue.h | 412 ++ .../hsa-runtime/core/inc/amd_blit_kernel.h | 174 + .../hsa-runtime/core/inc/amd_blit_kernel_kv.h | 479 +++ .../hsa-runtime/core/inc/amd_blit_kernel_vi.h | 490 +++ runtime/hsa-runtime/core/inc/amd_blit_sdma.h | 218 + runtime/hsa-runtime/core/inc/amd_cpu_agent.h | 154 + .../hsa-runtime/core/inc/amd_elf_image.hpp | 222 + runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 354 ++ runtime/hsa-runtime/core/inc/amd_hsa_code.hpp | 387 ++ .../hsa-runtime/core/inc/amd_hsa_loader.hpp | 358 ++ runtime/hsa-runtime/core/inc/amd_load_map.h | 174 + .../core/inc/amd_loader_context.hpp | 97 + .../hsa-runtime/core/inc/amd_memory_region.h | 191 + runtime/hsa-runtime/core/inc/amd_topology.h | 56 + runtime/hsa-runtime/core/inc/blit.h | 108 + runtime/hsa-runtime/core/inc/checked.h | 75 + runtime/hsa-runtime/core/inc/default_signal.h | 174 + runtime/hsa-runtime/core/inc/host_queue.h | 167 + .../hsa-runtime/core/inc/hsa_api_trace_int.h | 63 + .../hsa-runtime/core/inc/hsa_ext_interface.h | 80 + runtime/hsa-runtime/core/inc/hsa_internal.h | 347 ++ 
.../core/inc/hsa_table_interface.h | 47 + .../hsa-runtime/core/inc/interrupt_signal.h | 206 + runtime/hsa-runtime/core/inc/isa.h | 164 + runtime/hsa-runtime/core/inc/memory_region.h | 109 + runtime/hsa-runtime/core/inc/queue.h | 322 ++ runtime/hsa-runtime/core/inc/registers.h | 204 + runtime/hsa-runtime/core/inc/runtime.h | 498 +++ runtime/hsa-runtime/core/inc/signal.h | 269 ++ .../core/runtime/amd_aql_queue.cpp | 856 ++++ .../core/runtime/amd_blit_kernel.cpp | 647 +++ .../core/runtime/amd_blit_sdma.cpp | 858 ++++ .../core/runtime/amd_cpu_agent.cpp | 329 ++ .../core/runtime/amd_gpu_agent.cpp | 863 ++++ .../hsa-runtime/core/runtime/amd_load_map.cpp | 172 + .../core/runtime/amd_loader_context.cpp | 588 +++ .../core/runtime/amd_memory_region.cpp | 555 +++ .../hsa-runtime/core/runtime/amd_topology.cpp | 210 + .../core/runtime/default_signal.cpp | 275 ++ .../hsa-runtime/core/runtime/host_queue.cpp | 99 + runtime/hsa-runtime/core/runtime/hsa.cpp | 1710 ++++++++ .../core/runtime/hsa_api_trace.cpp | 191 + .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 555 +++ .../core/runtime/hsa_ext_interface.cpp | 530 +++ .../core/runtime/interrupt_signal.cpp | 372 ++ runtime/hsa-runtime/core/runtime/isa.cpp | 130 + runtime/hsa-runtime/core/runtime/runtime.cpp | 1010 +++++ runtime/hsa-runtime/core/runtime/signal.cpp | 187 + .../hsa-runtime/core/util/atomic_helpers.h | 405 ++ .../hsa-runtime/core/util/lnx/os_linux.cpp | 344 ++ runtime/hsa-runtime/core/util/locks.h | 136 + runtime/hsa-runtime/core/util/os.h | 216 + runtime/hsa-runtime/core/util/small_heap.cpp | 174 + runtime/hsa-runtime/core/util/small_heap.h | 114 + runtime/hsa-runtime/core/util/timer.cpp | 105 + runtime/hsa-runtime/core/util/timer.h | 162 + runtime/hsa-runtime/core/util/utils.h | 267 ++ runtime/hsa-runtime/inc/Brig.h | 1530 +++++++ runtime/hsa-runtime/inc/amd_hsa_common.h | 91 + runtime/hsa-runtime/inc/amd_hsa_elf.h | 295 ++ runtime/hsa-runtime/inc/amd_hsa_kernel_code.h | 271 ++ runtime/hsa-runtime/inc/amd_hsa_queue.h | 
86 + runtime/hsa-runtime/inc/amd_hsa_signal.h | 80 + runtime/hsa-runtime/inc/hsa.h | 3728 +++++++++++++++++ runtime/hsa-runtime/inc/hsa_api_trace.h | 177 + runtime/hsa-runtime/inc/hsa_ext_amd.h | 1183 ++++++ runtime/hsa-runtime/inc/hsa_ext_finalize.h | 531 +++ runtime/hsa-runtime/inc/hsa_ext_image.h | 964 +++++ .../hsa-runtime/libamdhsacode/CMakeLists.txt | 17 + .../libamdhsacode/amd_elf_image.cpp | 1691 ++++++++ .../libamdhsacode/amd_hsa_code.cpp | 1340 ++++++ .../libamdhsacode/amd_hsa_code_util.cpp | 1033 +++++ .../libamdhsacode/amd_hsa_code_util.hpp | 193 + .../libamdhsacode/amd_hsa_locks.cpp | 94 + .../libamdhsacode/amd_hsa_locks.hpp | 127 + .../hsa-runtime/libamdhsacode/amd_options.cpp | 340 ++ .../hsa-runtime/libamdhsacode/amd_options.hpp | 430 ++ runtime/hsa-runtime/loader/CMakeLists.txt | 16 + runtime/hsa-runtime/loader/executable.cpp | 1175 ++++++ runtime/hsa-runtime/loader/executable.hpp | 465 ++ runtime/hsa-runtime/loader/loaders.cpp | 234 ++ runtime/hsa-runtime/loader/loaders.hpp | 106 + runtime/hsa-runtime/utils/sp3/LICENSE.txt | 19 + runtime/hsa-runtime/utils/sp3/sp3-asic.h | 181 + runtime/hsa-runtime/utils/sp3/sp3-int.h | 553 +++ runtime/hsa-runtime/utils/sp3/sp3-type.h | 137 + runtime/hsa-runtime/utils/sp3/sp3-vm.h | 119 + runtime/hsa-runtime/utils/sp3/sp3.h | 198 + 100 files changed, 37877 insertions(+) create mode 100644 runtime/hsa-runtime/CMakeLists.txt create mode 100644 runtime/hsa-runtime/README.md create mode 100644 runtime/hsa-runtime/cmake_modules/COPYING-CMAKE-SCRIPTS create mode 100644 runtime/hsa-runtime/cmake_modules/FindLibElf.cmake create mode 100644 runtime/hsa-runtime/cmake_modules/hsa_common.cmake create mode 100644 runtime/hsa-runtime/cmake_modules/utils.cmake create mode 100644 runtime/hsa-runtime/core/CMakeLists.txt create mode 100644 runtime/hsa-runtime/core/common/hsa_table_interface.cpp create mode 100644 runtime/hsa-runtime/core/common/shared.cpp create mode 100644 runtime/hsa-runtime/core/common/shared.h create mode 100644 
runtime/hsa-runtime/core/hsacore.so.def create mode 100644 runtime/hsa-runtime/core/inc/agent.h create mode 100644 runtime/hsa-runtime/core/inc/amd_aql_queue.h create mode 100644 runtime/hsa-runtime/core/inc/amd_blit_kernel.h create mode 100644 runtime/hsa-runtime/core/inc/amd_blit_kernel_kv.h create mode 100644 runtime/hsa-runtime/core/inc/amd_blit_kernel_vi.h create mode 100644 runtime/hsa-runtime/core/inc/amd_blit_sdma.h create mode 100644 runtime/hsa-runtime/core/inc/amd_cpu_agent.h create mode 100644 runtime/hsa-runtime/core/inc/amd_elf_image.hpp create mode 100644 runtime/hsa-runtime/core/inc/amd_gpu_agent.h create mode 100644 runtime/hsa-runtime/core/inc/amd_hsa_code.hpp create mode 100644 runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp create mode 100644 runtime/hsa-runtime/core/inc/amd_load_map.h create mode 100644 runtime/hsa-runtime/core/inc/amd_loader_context.hpp create mode 100644 runtime/hsa-runtime/core/inc/amd_memory_region.h create mode 100644 runtime/hsa-runtime/core/inc/amd_topology.h create mode 100644 runtime/hsa-runtime/core/inc/blit.h create mode 100644 runtime/hsa-runtime/core/inc/checked.h create mode 100644 runtime/hsa-runtime/core/inc/default_signal.h create mode 100644 runtime/hsa-runtime/core/inc/host_queue.h create mode 100644 runtime/hsa-runtime/core/inc/hsa_api_trace_int.h create mode 100644 runtime/hsa-runtime/core/inc/hsa_ext_interface.h create mode 100644 runtime/hsa-runtime/core/inc/hsa_internal.h create mode 100644 runtime/hsa-runtime/core/inc/hsa_table_interface.h create mode 100644 runtime/hsa-runtime/core/inc/interrupt_signal.h create mode 100644 runtime/hsa-runtime/core/inc/isa.h create mode 100644 runtime/hsa-runtime/core/inc/memory_region.h create mode 100644 runtime/hsa-runtime/core/inc/queue.h create mode 100644 runtime/hsa-runtime/core/inc/registers.h create mode 100644 runtime/hsa-runtime/core/inc/runtime.h create mode 100644 runtime/hsa-runtime/core/inc/signal.h create mode 100644 
runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp create mode 100644 runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp create mode 100644 runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp create mode 100644 runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp create mode 100644 runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp create mode 100644 runtime/hsa-runtime/core/runtime/amd_load_map.cpp create mode 100644 runtime/hsa-runtime/core/runtime/amd_loader_context.cpp create mode 100644 runtime/hsa-runtime/core/runtime/amd_memory_region.cpp create mode 100644 runtime/hsa-runtime/core/runtime/amd_topology.cpp create mode 100644 runtime/hsa-runtime/core/runtime/default_signal.cpp create mode 100644 runtime/hsa-runtime/core/runtime/host_queue.cpp create mode 100644 runtime/hsa-runtime/core/runtime/hsa.cpp create mode 100644 runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp create mode 100644 runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp create mode 100644 runtime/hsa-runtime/core/runtime/hsa_ext_interface.cpp create mode 100644 runtime/hsa-runtime/core/runtime/interrupt_signal.cpp create mode 100644 runtime/hsa-runtime/core/runtime/isa.cpp create mode 100644 runtime/hsa-runtime/core/runtime/runtime.cpp create mode 100644 runtime/hsa-runtime/core/runtime/signal.cpp create mode 100644 runtime/hsa-runtime/core/util/atomic_helpers.h create mode 100644 runtime/hsa-runtime/core/util/lnx/os_linux.cpp create mode 100644 runtime/hsa-runtime/core/util/locks.h create mode 100644 runtime/hsa-runtime/core/util/os.h create mode 100644 runtime/hsa-runtime/core/util/small_heap.cpp create mode 100644 runtime/hsa-runtime/core/util/small_heap.h create mode 100644 runtime/hsa-runtime/core/util/timer.cpp create mode 100644 runtime/hsa-runtime/core/util/timer.h create mode 100644 runtime/hsa-runtime/core/util/utils.h create mode 100644 runtime/hsa-runtime/inc/Brig.h create mode 100644 runtime/hsa-runtime/inc/amd_hsa_common.h create mode 100644 runtime/hsa-runtime/inc/amd_hsa_elf.h 
create mode 100644 runtime/hsa-runtime/inc/amd_hsa_kernel_code.h create mode 100644 runtime/hsa-runtime/inc/amd_hsa_queue.h create mode 100644 runtime/hsa-runtime/inc/amd_hsa_signal.h create mode 100644 runtime/hsa-runtime/inc/hsa.h create mode 100644 runtime/hsa-runtime/inc/hsa_api_trace.h create mode 100644 runtime/hsa-runtime/inc/hsa_ext_amd.h create mode 100644 runtime/hsa-runtime/inc/hsa_ext_finalize.h create mode 100644 runtime/hsa-runtime/inc/hsa_ext_image.h create mode 100644 runtime/hsa-runtime/libamdhsacode/CMakeLists.txt create mode 100644 runtime/hsa-runtime/libamdhsacode/amd_elf_image.cpp create mode 100644 runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp create mode 100644 runtime/hsa-runtime/libamdhsacode/amd_hsa_code_util.cpp create mode 100644 runtime/hsa-runtime/libamdhsacode/amd_hsa_code_util.hpp create mode 100644 runtime/hsa-runtime/libamdhsacode/amd_hsa_locks.cpp create mode 100644 runtime/hsa-runtime/libamdhsacode/amd_hsa_locks.hpp create mode 100644 runtime/hsa-runtime/libamdhsacode/amd_options.cpp create mode 100644 runtime/hsa-runtime/libamdhsacode/amd_options.hpp create mode 100644 runtime/hsa-runtime/loader/CMakeLists.txt create mode 100644 runtime/hsa-runtime/loader/executable.cpp create mode 100644 runtime/hsa-runtime/loader/executable.hpp create mode 100644 runtime/hsa-runtime/loader/loaders.cpp create mode 100644 runtime/hsa-runtime/loader/loaders.hpp create mode 100644 runtime/hsa-runtime/utils/sp3/LICENSE.txt create mode 100644 runtime/hsa-runtime/utils/sp3/sp3-asic.h create mode 100644 runtime/hsa-runtime/utils/sp3/sp3-int.h create mode 100644 runtime/hsa-runtime/utils/sp3/sp3-type.h create mode 100644 runtime/hsa-runtime/utils/sp3/sp3-vm.h create mode 100644 runtime/hsa-runtime/utils/sp3/sp3.h diff --git a/runtime/hsa-runtime/CMakeLists.txt b/runtime/hsa-runtime/CMakeLists.txt new file mode 100644 index 0000000000..20555fe2a5 --- /dev/null +++ b/runtime/hsa-runtime/CMakeLists.txt @@ -0,0 +1,68 @@ 
+################################################################################ +## +## The University of Illinois/NCSA +## Open Source License (NCSA) +## +## Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +## +## Developed by: +## +## AMD Research and AMD HSA Software Development +## +## Advanced Micro Devices, Inc. +## +## www.amd.com +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal with the Software without restriction, including without limitation +## the rights to use, copy, modify, merge, publish, distribute, sublicense, +## and#or sell copies of the Software, and to permit persons to whom the +## Software is furnished to do so, subject to the following conditions: +## +## - Redistributions of source code must retain the above copyright notice, +## this list of conditions and the following disclaimers. +## - Redistributions in binary form must reproduce the above copyright +## notice, this list of conditions and the following disclaimers in +## the documentation and#or other materials provided with the distribution. +## - Neither the names of Advanced Micro Devices, Inc, +## nor the names of its contributors may be used to endorse or promote +## products derived from this Software without specific prior written +## permission. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +## DEALINGS WITH THE SOFTWARE. 
+## +################################################################################ + +cmake_minimum_required(VERSION 2.8) + +project(hsa-runtime) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") + +include(utils) +include(hsa_common) + +if(NOT DEFINED VERSION_STRING) + set (VERSION_STRING "1") +endif() + +parse_version(${VERSION_STRING}) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/core/inc) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode) +include_directories($ENV{HSATHK_BUILD_INC_PATH}) + +link_directories ($ENV{HSATHK_BUILD_LIB_PATH}) + +add_subdirectory(libamdhsacode) +add_subdirectory(loader) +add_subdirectory(core) diff --git a/runtime/hsa-runtime/README.md b/runtime/hsa-runtime/README.md new file mode 100644 index 0000000000..3d13340c39 --- /dev/null +++ b/runtime/hsa-runtime/README.md @@ -0,0 +1,166 @@ +### Package Contents + +This directory contains the HSA Runtime source code for the Boltzmann release. It has been modified to support +AMD/ATI discrete GPUs. + +#### Source & Include directories + +core - Contains the source code for AMD's implementation of the core HSA Runtime API's. + +cmake_modules - CMake support modules and files. + +inc - Contains the public and AMD specific header files exposing the HSA Runtimes interfaces. + +libamdhsacode - HSAIL/Finalizer runtime interface. + +loader - Used to load code objects. + +utils - Utilities required to build the core runtime. + +#### Build environment + +CMake build framework is used to build the HSA runtime. The minimum version is 2.8. + +Obtain cmake infrastructure: http://www.cmake.org/download/ + +Export cmake bin into your PATH +HSA Runtime CMake build file CMakeLists.txt is located in runtime/core folder. 
+ +#### Package Dependencies + +The following support packages are required to successfully build the runtime: + +* libelf-dev +* g++ +* libc6-dev-i386 (for libhsakmt 32bit) + +#### Building the runtime + +To build the runtime a compatible version of the libhsakmt library and the +hsakmt.h header file must be available. The latest version of these files +can be obtained from the ROCT-Thunk-Interface repository, available here: + +https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface + +Specify the directory containing libhsakmt.so.1 and hsakmt.h using the following +environment variables: + +HSATHK_BUILD_INC_PATH - Set to the directory containing hsakmt.h. + +HSATHK_BUILD_LIB_PATH - Set to the directory containing libhsakmt.so.1 + +After setting these variables, create a build directory and invoke cmake on +the top level CMakeLists.txt file. For example, from the top level ROCR +repository execute: + +mkdir build && cd build && cmake ../src && make + +The name of the core hsa runtime is libhsa-runtime64.so.1. + +#### External requirements + +The core runtime requires the sp3.a library to be able to compile +on x86_64 architectures. The binaries for the sp3.a library can +be found on the amd-codexl-analyzer GitHub repository: + +https://github.com/GPUOpen-Tools/amd-codexl-analyzer + +The x86_64 library and associated header files have been added to +this code base for convenience, but are still subject to the +AMD copyright license. 
+ +#### Specs + +http://www.hsafoundation.com/standards/ + +HSA Runtime Specification 1.0 + +HSA Programmer Reference Manual Specification 1.0 + +HSA Platform System Architecture Specification 1.0 + +#### Runtime Design overview + +The AMD HSA runtime consists of three primary layers: + +C interface adaptors +C++ interface classes and common functions +AMD device specific implementations +Additionally the runtime is dependent on a small utility library which provides simple common functions, limited operating system and compiler abstraction, as well as atomic operation interfaces. + +#### C interface adaptors + +Files : + +hsa.h(cpp) + +hsa_ext_interface.h(cpp) + +The C interface layer provides C99 APIs as defined in the HSA Runtime Specification 1.0. The interfaces and default definitions for the standard extensions are also provided. The interface functions simply forward to a function pointer table defined here. The table is initialized to point to default definitions, which simply return an appropriate error code. If available the extension library is loaded as part of runtime initialization and the table is updated to point into the extension library. In this release the standard extensions (image support and finalizer) are implemented in separate libraries (not open sourced), and can be obtained from the HSA-Runtime-AMD git repository. + +#### C++ interface classes and common functions + +Files : + +runtime.h(cpp) + +agent.h + +queue.h + +signal.h + +memory_region.h(cpp) + +checked.h + +memory_database.h(cpp) + +default_signal.h(cpp) + +The C++ interface layer provides abstract interface classes encapsulating commands to HSA Signals, Agents, and Queues. This layer also contains the implementation of device independent commands, such as hsa_init and hsa_system_get_info, and a default signal and queue implementation. 
+ +#### Device Specific Implementations + +Files: + +amd_cpu_agent.h(cpp) + +amd_gpu_agent.h(cpp) + +amd_hw_aql_command_processor.h(cpp) + +amd_memory_region.h(cpp) + +amd_memory_registration.h(cpp) + +amd_topology.h(cpp) + +host_queue.h(cpp) + +interrupt_signal.h(cpp) + +hsa_ext_private_amd.h(cpp) + +The device specific layer contains implementations of the C++ interface classes which implement HSA functionality for AMD Kaveri & Carrizo APUs. + +#### Implemented functionality + +* The following queries are not implemented: + ** hsa_code_symbol_get_info: HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION + ** hsa_executable_symbol_get_info: HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT, HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION + +#### Known Issues + +* Max total coarse grain region limit is 8GB. +* hsa_agent_get_exception_policies is not implemented. +* Image import/export/copy/fill only support image created with memory from host accessible region. +* hsa_system_get_extension_table is not implemented for HSA_EXTENSION_AMD_PROFILER. + +#### Disclaimer + +The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. 
Terms and limitations applicable to the purchase or use of AMD's products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale. + +AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. + +Copyright (c) 2014-2015 Advanced Micro Devices, Inc. All rights reserved. diff --git a/runtime/hsa-runtime/cmake_modules/COPYING-CMAKE-SCRIPTS b/runtime/hsa-runtime/cmake_modules/COPYING-CMAKE-SCRIPTS new file mode 100644 index 0000000000..4b417765f3 --- /dev/null +++ b/runtime/hsa-runtime/cmake_modules/COPYING-CMAKE-SCRIPTS @@ -0,0 +1,22 @@ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/runtime/hsa-runtime/cmake_modules/FindLibElf.cmake b/runtime/hsa-runtime/cmake_modules/FindLibElf.cmake new file mode 100644 index 0000000000..690eccd209 --- /dev/null +++ b/runtime/hsa-runtime/cmake_modules/FindLibElf.cmake @@ -0,0 +1,69 @@ +# - Try to find libelf +# Once done this will define +# +# LIBELF_FOUND - system has libelf +# LIBELF_INCLUDE_DIRS - the libelf include directory +# LIBELF_LIBRARIES - Link these to use libelf +# LIBELF_DEFINITIONS - Compiler switches required for using libelf +# +# Copyright (c) 2008 Bernhard Walle +# +# Redistribution and use is allowed according to the terms of the New +# BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+# + +if (LIBELF_FOUND) + return() +endif (LIBELF_FOUND) + +find_path (LIBELF_INCLUDE_DIRS + NAMES + libelf.h + PATHS + /usr/include + /usr/include/libelf + /usr/local/include + /usr/local/include/libelf + /opt/local/include + /opt/local/include/libelf + /sw/include + /sw/include/libelf + ENV CPATH) + +find_library (LIBELF_LIBRARIES + NAMES + elf + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +include (FindPackageHandleStandardArgs) + + +# handle the QUIETLY and REQUIRED arguments and set LIBELF_FOUND to TRUE if all listed variables are TRUE +FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibElf DEFAULT_MSG + LIBELF_LIBRARIES + LIBELF_INCLUDE_DIRS) + +SET(CMAKE_REQUIRED_LIBRARIES elf) +INCLUDE(CheckCXXSourceCompiles) +CHECK_CXX_SOURCE_COMPILES("#include <libelf.h> +int main() { + Elf *e = (Elf*)0; + size_t sz; + elf_getshdrstrndx(e, &sz); + return 0; +}" ELF_GETSHDRSTRNDX) + +mark_as_advanced(LIBELF_INCLUDE_DIRS LIBELF_LIBRARIES ELF_GETSHDRSTRNDX) + +if(LIBELF_FOUND) + add_library(elf UNKNOWN IMPORTED) + set_property(TARGET elf PROPERTY IMPORTED_LOCATION ${LIBELF_LIBRARIES}) + set_property(TARGET elf PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${LIBELF_INCLUDE_DIRS}) +endif() diff --git a/runtime/hsa-runtime/cmake_modules/hsa_common.cmake b/runtime/hsa-runtime/cmake_modules/hsa_common.cmake new file mode 100644 index 0000000000..0f7dd57a05 --- /dev/null +++ b/runtime/hsa-runtime/cmake_modules/hsa_common.cmake @@ -0,0 +1,79 @@ +################################################################################ +## +## The University of Illinois/NCSA +## Open Source License (NCSA) +## +## Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +## +## Developed by: +## +## AMD Research and AMD HSA Software Development +## +## Advanced Micro Devices, Inc. 
+## +## www.amd.com +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal with the Software without restriction, including without limitation +## the rights to use, copy, modify, merge, publish, distribute, sublicense, +## and#or sell copies of the Software, and to permit persons to whom the +## Software is furnished to do so, subject to the following conditions: +## +## - Redistributions of source code must retain the above copyright notice, +## this list of conditions and the following disclaimers. +## - Redistributions in binary form must reproduce the above copyright +## notice, this list of conditions and the following disclaimers in +## the documentation and#or other materials provided with the distribution. +## - Neither the names of Advanced Micro Devices, Inc, +## nor the names of its contributors may be used to endorse or promote +## products derived from this Software without specific prior written +## permission. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +## DEALINGS WITH THE SOFTWARE. +## +################################################################################ + +# +# HSA Build compiler definitions common between components. 
+# + +set(IS64BIT 0) +set(ONLY64STR "32") +if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(IS64BIT 1) + set(ONLY64STR "64") +endif() + +if(UNIX) + set(PS ":") + set(CMAKE_CXX_FLAGS "-Wall -std=c++11 ${EXTRA_CFLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpic") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--unresolved-symbols=ignore-in-shared-libs") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-strict-aliasing") + if ( IS64BIT ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" ) + else () + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" ) + endif () + if ( "${CMAKE_BUILD_TYPE}" STREQUAL Debug ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" ) + endif () + add_definitions(-D __STDC_LIMIT_MACROS) + add_definitions(-D __STDC_CONSTANT_MACROS) + add_definitions(-D __STDC_FORMAT_MACROS) + add_definitions (-DLITTLEENDIAN_CPU=1) +else() + set (PS "\;") +endif() + +if(MSVC) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") +endif() diff --git a/runtime/hsa-runtime/cmake_modules/utils.cmake b/runtime/hsa-runtime/cmake_modules/utils.cmake new file mode 100644 index 0000000000..58067a4d3c --- /dev/null +++ b/runtime/hsa-runtime/cmake_modules/utils.cmake @@ -0,0 +1,71 @@ +################################################################################ +## +## The University of Illinois/NCSA +## Open Source License (NCSA) +## +## Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +## +## Developed by: +## +## AMD Research and AMD HSA Software Development +## +## Advanced Micro Devices, Inc. 
+## +## www.amd.com +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal with the Software without restriction, including without limitation +## the rights to use, copy, modify, merge, publish, distribute, sublicense, +## and#or sell copies of the Software, and to permit persons to whom the +## Software is furnished to do so, subject to the following conditions: +## +## - Redistributions of source code must retain the above copyright notice, +## this list of conditions and the following disclaimers. +## - Redistributions in binary form must reproduce the above copyright +## notice, this list of conditions and the following disclaimers in +## the documentation and#or other materials provided with the distribution. +## - Neither the names of Advanced Micro Devices, Inc, +## nor the names of its contributors may be used to endorse or promote +## products derived from this Software without specific prior written +## permission. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +## DEALINGS WITH THE SOFTWARE. +## +################################################################################ + +## Parses the VERSION_STRING variable and places +## the first, second and third number values in +## the major, minor and patch variables. 
+function(parse_version VERSION_STRING) + + string(REGEX MATCHALL "[0123456789]+" VERSIONS ${VERSION_STRING}) + list(LENGTH VERSIONS VERSION_COUNT) + + if (${VERSION_COUNT} GREATER 0) + list(GET VERSIONS 0 MAJOR) + set(VERSION_MAJOR ${MAJOR} PARENT_SCOPE) + set(TEMP_VERSION_STRING "${MAJOR}") + endif () + + if (${VERSION_COUNT} GREATER 1) + list(GET VERSIONS 1 MINOR) + set(VERSION_MINOR ${MINOR} PARENT_SCOPE) + set(TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}") + endif () + + if (${VERSION_COUNT} GREATER 2) + list(GET VERSIONS 2 PATCH) + set(VERSION_PATCH ${PATCH} PARENT_SCOPE) + set(TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}") + endif () + + set(VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE) + +endfunction() diff --git a/runtime/hsa-runtime/core/CMakeLists.txt b/runtime/hsa-runtime/core/CMakeLists.txt new file mode 100644 index 0000000000..3ce04a041d --- /dev/null +++ b/runtime/hsa-runtime/core/CMakeLists.txt @@ -0,0 +1,171 @@ +################################################################################ +## +## The University of Illinois/NCSA +## Open Source License (NCSA) +## +## Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +## +## Developed by: +## +## AMD Research and AMD HSA Software Development +## +## Advanced Micro Devices, Inc. +## +## www.amd.com +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal with the Software without restriction, including without limitation +## the rights to use, copy, modify, merge, publish, distribute, sublicense, +## and#or sell copies of the Software, and to permit persons to whom the +## Software is furnished to do so, subject to the following conditions: +## +## - Redistributions of source code must retain the above copyright notice, +## this list of conditions and the following disclaimers. 
+##  - Redistributions in binary form must reproduce the above copyright
+##    notice, this list of conditions and the following disclaimers in
+##    the documentation and/or other materials provided with the distribution.
+##  - Neither the names of Advanced Micro Devices, Inc,
+##    nor the names of its contributors may be used to endorse or promote
+##    products derived from this Software without specific prior written
+##    permission.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+## DEALINGS WITH THE SOFTWARE.
+##
+################################################################################
+
+cmake_minimum_required ( VERSION 2.8.0 )
+## GCC 4.8 or higher compiler required.
+
+if ( WIN32 )
+  MESSAGE ( FATAL_ERROR "Windows build is not supported." )
+endif ()
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake_modules")
+
+## Process environment variables.
+## HSATHK_BUILD_TARGET_BITS=32 selects a 32-bit build; any other value
+## (including unset) selects 64-bit and adds the "64" target-name suffix.
+if ( "$ENV{HSATHK_BUILD_TARGET_BITS}" STREQUAL 32 )
+  set ( ONLY64STR "" )
+  set ( IS64BIT 0 )
+else ()
+  set ( ONLY64STR "64" )
+  set ( IS64BIT 1 )
+endif ()
+
+## The thunk header and library are mandatory; fail early when either is
+## missing. Paths are quoted so they are always treated as a single argument.
+if ( NOT EXISTS "$ENV{HSATHK_BUILD_INC_PATH}/hsakmt.h" )
+  MESSAGE ( FATAL_ERROR "Environment variable HSATHK_BUILD_INC_PATH is not set to point to the location where KFD Thunk header file hsakmt.h (and rest of the thunk headers) could be found." )
+endif ()
+
+if ( NOT EXISTS "$ENV{HSATHK_BUILD_LIB_PATH}/libhsakmt.so.1" )
+  MESSAGE ( FATAL_ERROR "Environment variable HSATHK_BUILD_LIB_PATH is not set to point to the location where KFD Thunk library libhsakmt.so.1 could be found." )
+endif ()
+
+## sp3 is optional in the environment: fall back to the in-tree copy when the
+## environment does not provide the header / static library.
+if ( EXISTS "$ENV{LIBSP3_BUILD_INC_PATH}/sp3.h" )
+  set ( LIBSP3_BUILD_INC_PATH "$ENV{LIBSP3_BUILD_INC_PATH}" )
+else ()
+  set ( LIBSP3_BUILD_INC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../utils/sp3" )
+endif ()
+
+if ( EXISTS "$ENV{LIBSP3_BUILD_LIB_PATH}/libsp3.a" )
+  set ( LIBSP3_BUILD_LIB_PATH "$ENV{LIBSP3_BUILD_LIB_PATH}" )
+else ()
+  set ( LIBSP3_BUILD_LIB_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../utils/sp3" )
+endif ()
+
+MESSAGE ( ------IS64BIT: ${IS64BIT} )
+MESSAGE ( ------Compiler: ${CMAKE_CXX_COMPILER} )
+MESSAGE ( ------Version: ${CMAKE_CXX_COMPILER_VERSION} )
+
+## Set core runtime module name and project name.
+set ( CORE_RUNTIME_NAME "hsa-runtime" )
+set ( CORE_RUNTIME_COMPONENT "lib${CORE_RUNTIME_NAME}" )
+set ( CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}${ONLY64STR}" )
+project ( ${CORE_RUNTIME_TARGET} )
+
+## Verbose output.
+set ( CMAKE_VERBOSE_MAKEFILE on )
+
+## Compiler preproc definitions.
+add_definitions ( -D__linux__ )
+add_definitions ( -DHSA_EXPORT=1 )
+add_definitions ( -DHSA_EXPORT_FINALIZER=1 )
+add_definitions ( -DHSA_EXPORT_IMAGES=1 )
+
+## ------------------------- Linux Compiler and Linker options -------------------------
+set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-deprecated-declarations -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-but-set-variable -Wno-error=unused-function" )
+
+## Symbols are hidden by default (-fvisibility=hidden); the version script
+## below is what decides which names the shared library exports.
+set ( DRVDEF "${CMAKE_CURRENT_SOURCE_DIR}/hsacore.so.def" )
+
+## Append to, rather than replace, the existing linker flags so that flags
+## supplied by the user or toolchain are kept, as is done for CMAKE_CXX_FLAGS.
+set ( CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bdynamic -Wl,-z,noexecstack -Wl,--version-script=${DRVDEF}" )
+
+set ( CMAKE_SKIP_BUILD_RPATH TRUE)
+
+## ------------------------- End Compiler and Linker options ----------------------------
+
+## Source files.
+## Listed in the order they are handed to add_library().
+set ( CORE_SRCS
+  util/lnx/os_linux.cpp
+  util/small_heap.cpp
+  util/timer.cpp
+  runtime/amd_blit_kernel.cpp
+  runtime/amd_blit_sdma.cpp
+  runtime/amd_cpu_agent.cpp
+  runtime/amd_gpu_agent.cpp
+  runtime/amd_aql_queue.cpp
+  runtime/amd_loader_context.cpp
+  runtime/amd_load_map.cpp
+  runtime/amd_memory_region.cpp
+  runtime/amd_topology.cpp
+  runtime/default_signal.cpp
+  runtime/host_queue.cpp
+  runtime/hsa.cpp
+  runtime/hsa_api_trace.cpp
+  runtime/hsa_ext_amd.cpp
+  runtime/hsa_ext_interface.cpp
+  runtime/interrupt_signal.cpp
+  runtime/isa.cpp
+  runtime/runtime.cpp
+  runtime/signal.cpp
+  common/shared.cpp
+  common/hsa_table_interface.cpp
+)
+
+## Include path(s).
+## Project directories first, then the thunk and sp3 header locations.
+include_directories (
+  ${CMAKE_CURRENT_SOURCE_DIR}/..
+  ${CMAKE_CURRENT_SOURCE_DIR}/../inc
+  ${CMAKE_CURRENT_SOURCE_DIR}/inc
+  $ENV{HSATHK_BUILD_INC_PATH}
+  ${LIBSP3_BUILD_INC_PATH}
+)
+
+## Library path(s).
+link_directories ( $ENV{HSATHK_BUILD_LIB_PATH} )
+link_directories ( ${LIBSP3_BUILD_LIB_PATH} )
+
+add_library ( ${CORE_RUNTIME_TARGET} SHARED ${CORE_SRCS} )
+
+## Set the VERSION and SOVERSION values
+if ( DEFINED VERSION_STRING )
+  set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY VERSION "${VERSION_STRING}" )
+endif ()
+
+set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY SOVERSION "${VERSION_MAJOR}" )
+
+target_link_libraries ( ${CORE_RUNTIME_TARGET}
+  PRIVATE amdhsaloader
+  PRIVATE amdhsacode
+  PRIVATE hsakmt
+  PRIVATE sp3
+  dl pthread rt
+)
+
+## If the build is Release, strip the target library.
+## $<TARGET_FILE:...> names exactly the library produced by this target, so
+## the step does not depend on shell globbing in the working directory and
+## never touches other shared objects that happen to be there.
+if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
+  add_custom_command ( TARGET ${CORE_RUNTIME_TARGET} POST_BUILD COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${CORE_RUNTIME_TARGET}> )
+endif ()
+
+## Set install information
+install ( TARGETS ${CORE_RUNTIME_TARGET} LIBRARY DESTINATION lib COMPONENT ${CORE_RUNTIME_COMPONENT})
diff --git a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp
new file mode 100644
index 0000000000..4e1b6d44b0
--- /dev/null
+++ b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp
@@ -0,0 +1,604 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "hsa_api_trace.h" + +static const ApiTable* HsaApiTable; + +void hsa_table_interface_init(const ApiTable* Table) { HsaApiTable = Table; } + +const ApiTable* hsa_table_interface_get_table() { return HsaApiTable; } + +// Pass through stub functions +hsa_status_t HSA_API hsa_init() { return HsaApiTable->hsa_init_fn(); } + +hsa_status_t HSA_API hsa_shut_down() { return HsaApiTable->hsa_shut_down_fn(); } + +hsa_status_t HSA_API + hsa_system_get_info(hsa_system_info_t attribute, void* value) { + return HsaApiTable->hsa_system_get_info_fn(attribute, value); +} + +hsa_status_t HSA_API + hsa_system_extension_supported(uint16_t extension, uint16_t version_major, + uint16_t version_minor, bool* result) { + return HsaApiTable->hsa_system_extension_supported_fn( + extension, version_major, version_minor, result); +} + +hsa_status_t HSA_API + hsa_system_get_extension_table(uint16_t extension, uint16_t version_major, + uint16_t version_minor, void* table) { + return HsaApiTable->hsa_system_get_extension_table_fn( + extension, version_major, version_minor, table); +} + +hsa_status_t HSA_API + hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent, void* data), + void* data) { + return HsaApiTable->hsa_iterate_agents_fn(callback, data); +} + +hsa_status_t HSA_API hsa_agent_get_info(hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value) { + return HsaApiTable->hsa_agent_get_info_fn(agent, attribute, value); +} + +hsa_status_t HSA_API hsa_agent_get_exception_policies(hsa_agent_t agent, + hsa_profile_t profile, + uint16_t* mask) { + return HsaApiTable->hsa_agent_get_exception_policies_fn(agent, profile, mask); +} + +hsa_status_t HSA_API + hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent, + uint16_t version_major, + uint16_t version_minor, bool* result) { + return HsaApiTable->hsa_agent_extension_supported_fn( + extension, agent, version_major, 
version_minor, result); +} + +hsa_status_t HSA_API + hsa_queue_create(hsa_agent_t agent, uint32_t size, hsa_queue_type_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue) { + return HsaApiTable->hsa_queue_create_fn(agent, size, type, callback, data, + private_segment_size, + group_segment_size, queue); +} + +hsa_status_t HSA_API + hsa_soft_queue_create(hsa_region_t region, uint32_t size, + hsa_queue_type_t type, uint32_t features, + hsa_signal_t completion_signal, hsa_queue_t** queue) { + return HsaApiTable->hsa_soft_queue_create_fn(region, size, type, features, + completion_signal, queue); +} + +hsa_status_t HSA_API hsa_queue_destroy(hsa_queue_t* queue) { + return HsaApiTable->hsa_queue_destroy_fn(queue); +} + +hsa_status_t HSA_API hsa_queue_inactivate(hsa_queue_t* queue) { + return HsaApiTable->hsa_queue_inactivate_fn(queue); +} + +uint64_t HSA_API hsa_queue_load_read_index_acquire(const hsa_queue_t* queue) { + return HsaApiTable->hsa_queue_load_read_index_acquire_fn(queue); +} + +uint64_t HSA_API hsa_queue_load_read_index_relaxed(const hsa_queue_t* queue) { + return HsaApiTable->hsa_queue_load_read_index_relaxed_fn(queue); +} + +uint64_t HSA_API hsa_queue_load_write_index_acquire(const hsa_queue_t* queue) { + return HsaApiTable->hsa_queue_load_write_index_acquire_fn(queue); +} + +uint64_t HSA_API hsa_queue_load_write_index_relaxed(const hsa_queue_t* queue) { + return HsaApiTable->hsa_queue_load_write_index_relaxed_fn(queue); +} + +void HSA_API hsa_queue_store_write_index_relaxed(const hsa_queue_t* queue, + uint64_t value) { + return HsaApiTable->hsa_queue_store_write_index_relaxed_fn(queue, value); +} + +void HSA_API hsa_queue_store_write_index_release(const hsa_queue_t* queue, + uint64_t value) { + return HsaApiTable->hsa_queue_store_write_index_release_fn(queue, value); +} + +uint64_t HSA_API hsa_queue_cas_write_index_acq_rel(const 
hsa_queue_t* queue, + uint64_t expected, + uint64_t value) { + return HsaApiTable->hsa_queue_cas_write_index_acq_rel_fn(queue, expected, + value); +} + +uint64_t HSA_API hsa_queue_cas_write_index_acquire(const hsa_queue_t* queue, + uint64_t expected, + uint64_t value) { + return HsaApiTable->hsa_queue_cas_write_index_acquire_fn(queue, expected, + value); +} + +uint64_t HSA_API hsa_queue_cas_write_index_relaxed(const hsa_queue_t* queue, + uint64_t expected, + uint64_t value) { + return HsaApiTable->hsa_queue_cas_write_index_relaxed_fn(queue, expected, + value); +} + +uint64_t HSA_API hsa_queue_cas_write_index_release(const hsa_queue_t* queue, + uint64_t expected, + uint64_t value) { + return HsaApiTable->hsa_queue_cas_write_index_release_fn(queue, expected, + value); +} + +uint64_t HSA_API hsa_queue_add_write_index_acq_rel(const hsa_queue_t* queue, + uint64_t value) { + return HsaApiTable->hsa_queue_add_write_index_acq_rel_fn(queue, value); +} + +uint64_t HSA_API hsa_queue_add_write_index_acquire(const hsa_queue_t* queue, + uint64_t value) { + return HsaApiTable->hsa_queue_add_write_index_acquire_fn(queue, value); +} + +uint64_t HSA_API hsa_queue_add_write_index_relaxed(const hsa_queue_t* queue, + uint64_t value) { + return HsaApiTable->hsa_queue_add_write_index_relaxed_fn(queue, value); +} + +uint64_t HSA_API hsa_queue_add_write_index_release(const hsa_queue_t* queue, + uint64_t value) { + return HsaApiTable->hsa_queue_add_write_index_release_fn(queue, value); +} + +void HSA_API hsa_queue_store_read_index_relaxed(const hsa_queue_t* queue, + uint64_t value) { + return HsaApiTable->hsa_queue_store_read_index_relaxed_fn(queue, value); +} + +void HSA_API hsa_queue_store_read_index_release(const hsa_queue_t* queue, + uint64_t value) { + return HsaApiTable->hsa_queue_store_read_index_release_fn(queue, value); +} + +hsa_status_t HSA_API hsa_agent_iterate_regions( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) { + return 
HsaApiTable->hsa_agent_iterate_regions_fn(agent, callback, data); +} + +hsa_status_t HSA_API hsa_region_get_info(hsa_region_t region, + hsa_region_info_t attribute, + void* value) { + return HsaApiTable->hsa_region_get_info_fn(region, attribute, value); +} + +hsa_status_t HSA_API hsa_memory_register(void* address, size_t size) { + return HsaApiTable->hsa_memory_register_fn(address, size); +} + +hsa_status_t HSA_API hsa_memory_deregister(void* address, size_t size) { + return HsaApiTable->hsa_memory_deregister_fn(address, size); +} + +hsa_status_t HSA_API + hsa_memory_allocate(hsa_region_t region, size_t size, void** ptr) { + return HsaApiTable->hsa_memory_allocate_fn(region, size, ptr); +} + +hsa_status_t HSA_API hsa_memory_free(void* ptr) { + return HsaApiTable->hsa_memory_free_fn(ptr); +} + +hsa_status_t HSA_API hsa_memory_copy(void* dst, const void* src, size_t size) { + return HsaApiTable->hsa_memory_copy_fn(dst, src, size); +} + +hsa_status_t HSA_API hsa_memory_assign_agent(void* ptr, hsa_agent_t agent, + hsa_access_permission_t access) { + return HsaApiTable->hsa_memory_assign_agent_fn(ptr, agent, access); +} + +hsa_status_t HSA_API + hsa_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers, + const hsa_agent_t* consumers, hsa_signal_t* signal) { + return HsaApiTable->hsa_signal_create_fn(initial_value, num_consumers, + consumers, signal); +} + +hsa_status_t HSA_API hsa_signal_destroy(hsa_signal_t signal) { + return HsaApiTable->hsa_signal_destroy_fn(signal); +} + +hsa_signal_value_t HSA_API hsa_signal_load_relaxed(hsa_signal_t signal) { + return HsaApiTable->hsa_signal_load_relaxed_fn(signal); +} + +hsa_signal_value_t HSA_API hsa_signal_load_acquire(hsa_signal_t signal) { + return HsaApiTable->hsa_signal_load_acquire_fn(signal); +} + +void HSA_API + hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_store_relaxed_fn(signal, value); +} + +void HSA_API + 
hsa_signal_store_release(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_store_release_fn(signal, value); +} + +hsa_signal_value_t HSA_API + hsa_signal_wait_relaxed(hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_expectancy_hint) { + return HsaApiTable->hsa_signal_wait_relaxed_fn( + signal, condition, compare_value, timeout_hint, wait_expectancy_hint); +} + +hsa_signal_value_t HSA_API + hsa_signal_wait_acquire(hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_expectancy_hint) { + return HsaApiTable->hsa_signal_wait_acquire_fn( + signal, condition, compare_value, timeout_hint, wait_expectancy_hint); +} + +void HSA_API + hsa_signal_and_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_and_relaxed_fn(signal, value); +} + +void HSA_API + hsa_signal_and_acquire(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_and_acquire_fn(signal, value); +} + +void HSA_API + hsa_signal_and_release(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_and_release_fn(signal, value); +} + +void HSA_API + hsa_signal_and_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_and_acq_rel_fn(signal, value); +} + +void HSA_API + hsa_signal_or_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_or_relaxed_fn(signal, value); +} + +void HSA_API + hsa_signal_or_acquire(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_or_acquire_fn(signal, value); +} + +void HSA_API + hsa_signal_or_release(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_or_release_fn(signal, value); +} + +void HSA_API + hsa_signal_or_acq_rel(hsa_signal_t signal, 
hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_or_acq_rel_fn(signal, value); +} + +void HSA_API + hsa_signal_xor_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_xor_relaxed_fn(signal, value); +} + +void HSA_API + hsa_signal_xor_acquire(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_xor_acquire_fn(signal, value); +} + +void HSA_API + hsa_signal_xor_release(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_xor_release_fn(signal, value); +} + +void HSA_API + hsa_signal_xor_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_xor_acq_rel_fn(signal, value); +} + +void HSA_API + hsa_signal_add_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_add_relaxed_fn(signal, value); +} + +void HSA_API + hsa_signal_add_acquire(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_add_acquire_fn(signal, value); +} + +void HSA_API + hsa_signal_add_release(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_add_release_fn(signal, value); +} + +void HSA_API + hsa_signal_add_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_add_acq_rel_fn(signal, value); +} + +void HSA_API + hsa_signal_subtract_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_subtract_relaxed_fn(signal, value); +} + +void HSA_API + hsa_signal_subtract_acquire(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_subtract_acquire_fn(signal, value); +} + +void HSA_API + hsa_signal_subtract_release(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_subtract_release_fn(signal, value); +} + +void HSA_API + hsa_signal_subtract_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { + return 
HsaApiTable->hsa_signal_subtract_acq_rel_fn(signal, value); +} + +hsa_signal_value_t HSA_API + hsa_signal_exchange_relaxed(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_exchange_relaxed_fn(signal, value); +} + +hsa_signal_value_t HSA_API + hsa_signal_exchange_acquire(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_exchange_acquire_fn(signal, value); +} + +hsa_signal_value_t HSA_API + hsa_signal_exchange_release(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_exchange_release_fn(signal, value); +} + +hsa_signal_value_t HSA_API + hsa_signal_exchange_acq_rel(hsa_signal_t signal, hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_exchange_acq_rel_fn(signal, value); +} + +hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_cas_relaxed_fn(signal, expected, value); +} + +hsa_signal_value_t HSA_API hsa_signal_cas_acquire(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_cas_acquire_fn(signal, expected, value); +} + +hsa_signal_value_t HSA_API hsa_signal_cas_release(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_cas_release_fn(signal, expected, value); +} + +hsa_signal_value_t HSA_API hsa_signal_cas_acq_rel(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value) { + return HsaApiTable->hsa_signal_cas_acq_rel_fn(signal, expected, value); +} + +hsa_status_t hsa_isa_from_name(const char* name, hsa_isa_t* isa) { + return HsaApiTable->hsa_isa_from_name_fn(name, isa); +} + +hsa_status_t HSA_API hsa_isa_get_info(hsa_isa_t isa, hsa_isa_info_t attribute, + uint32_t index, void* value) { + return HsaApiTable->hsa_isa_get_info_fn(isa, attribute, index, value); +} + +hsa_status_t hsa_isa_compatible(hsa_isa_t 
code_object_isa, hsa_isa_t agent_isa, + bool* result) { + return HsaApiTable->hsa_isa_compatible_fn(code_object_isa, agent_isa, result); +} + +hsa_status_t HSA_API hsa_code_object_serialize( + hsa_code_object_t code_object, + hsa_status_t (*alloc_callback)(size_t size, hsa_callback_data_t data, + void** address), + hsa_callback_data_t callback_data, const char* options, + void** serialized_code_object, size_t* serialized_code_object_size) { + return HsaApiTable->hsa_code_object_serialize_fn( + code_object, alloc_callback, callback_data, options, + serialized_code_object, serialized_code_object_size); +} + +hsa_status_t HSA_API + hsa_code_object_deserialize(void* serialized_code_object, + size_t serialized_code_object_size, + const char* options, + hsa_code_object_t* code_object) { + return HsaApiTable->hsa_code_object_deserialize_fn( + serialized_code_object, serialized_code_object_size, options, + code_object); +} + +hsa_status_t HSA_API hsa_code_object_destroy(hsa_code_object_t code_object) { + return HsaApiTable->hsa_code_object_destroy_fn(code_object); +} + +hsa_status_t HSA_API hsa_code_object_get_info(hsa_code_object_t code_object, + hsa_code_object_info_t attribute, + void* value) { + return HsaApiTable->hsa_code_object_get_info_fn(code_object, attribute, + value); +} + +hsa_status_t HSA_API hsa_code_object_get_symbol(hsa_code_object_t code_object, + const char* symbol_name, + hsa_code_symbol_t* symbol) { + return HsaApiTable->hsa_code_object_get_symbol_fn(code_object, symbol_name, + symbol); +} + +hsa_status_t HSA_API hsa_code_symbol_get_info(hsa_code_symbol_t code_symbol, + hsa_code_symbol_info_t attribute, + void* value) { + return HsaApiTable->hsa_code_symbol_get_info_fn(code_symbol, attribute, + value); +} + +hsa_status_t HSA_API hsa_code_object_iterate_symbols( + hsa_code_object_t code_object, + hsa_status_t (*callback)(hsa_code_object_t code_object, + hsa_code_symbol_t symbol, void* data), + void* data) { + return 
HsaApiTable->hsa_code_object_iterate_symbols_fn(code_object, callback, + data); +} + +hsa_status_t HSA_API + hsa_executable_create(hsa_profile_t profile, + hsa_executable_state_t executable_state, + const char* options, hsa_executable_t* executable) { + return HsaApiTable->hsa_executable_create_fn(profile, executable_state, + options, executable); +} + +hsa_status_t HSA_API hsa_executable_destroy(hsa_executable_t executable) { + return HsaApiTable->hsa_executable_destroy_fn(executable); +} + +hsa_status_t HSA_API + hsa_executable_load_code_object(hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char* options) { + return HsaApiTable->hsa_executable_load_code_object_fn(executable, agent, + code_object, options); +} + +hsa_status_t HSA_API + hsa_executable_freeze(hsa_executable_t executable, const char* options) { + return HsaApiTable->hsa_executable_freeze_fn(executable, options); +} + +hsa_status_t HSA_API hsa_executable_get_info(hsa_executable_t executable, + hsa_executable_info_t attribute, + void* value) { + return HsaApiTable->hsa_executable_get_info_fn(executable, attribute, value); +} + +hsa_status_t HSA_API + hsa_executable_global_variable_define(hsa_executable_t executable, + const char* variable_name, + void* address) { + return HsaApiTable->hsa_executable_global_variable_define_fn( + executable, variable_name, address); +} + +hsa_status_t HSA_API + hsa_executable_agent_global_variable_define(hsa_executable_t executable, + hsa_agent_t agent, + const char* variable_name, + void* address) { + return HsaApiTable->hsa_executable_agent_global_variable_define_fn( + executable, agent, variable_name, address); +} + +hsa_status_t HSA_API + hsa_executable_readonly_variable_define(hsa_executable_t executable, + hsa_agent_t agent, + const char* variable_name, + void* address) { + return HsaApiTable->hsa_executable_readonly_variable_define_fn( + executable, agent, variable_name, address); +} + +hsa_status_t HSA_API + 
hsa_executable_validate(hsa_executable_t executable, uint32_t* result) { + return HsaApiTable->hsa_executable_validate_fn(executable, result); +} + +hsa_status_t HSA_API + hsa_executable_get_symbol(hsa_executable_t executable, + const char* module_name, const char* symbol_name, + hsa_agent_t agent, int32_t call_convention, + hsa_executable_symbol_t* symbol) { + return HsaApiTable->hsa_executable_get_symbol_fn( + executable, module_name, symbol_name, agent, call_convention, symbol); +} + +hsa_status_t HSA_API + hsa_executable_symbol_get_info(hsa_executable_symbol_t executable_symbol, + hsa_executable_symbol_info_t attribute, + void* value) { + return HsaApiTable->hsa_executable_symbol_get_info_fn(executable_symbol, + attribute, value); +} + +hsa_status_t HSA_API hsa_executable_iterate_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t executable, + hsa_executable_symbol_t symbol, void* data), + void* data) { + return HsaApiTable->hsa_executable_iterate_symbols_fn(executable, callback, + data); +} + +hsa_status_t HSA_API + hsa_status_string(hsa_status_t status, const char** status_string) { + return HsaApiTable->hsa_status_string_fn(status, status_string); +} diff --git a/runtime/hsa-runtime/core/common/shared.cpp b/runtime/hsa-runtime/core/common/shared.cpp new file mode 100644 index 0000000000..07dbc89f19 --- /dev/null +++ b/runtime/hsa-runtime/core/common/shared.cpp @@ -0,0 +1,48 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/common/shared.h" + +namespace core { +std::function BaseShared::allocate_=nullptr; +std::function BaseShared::free_=nullptr; +} diff --git a/runtime/hsa-runtime/core/common/shared.h b/runtime/hsa-runtime/core/common/shared.h new file mode 100644 index 0000000000..36edaa078c --- /dev/null +++ b/runtime/hsa-runtime/core/common/shared.h @@ -0,0 +1,109 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTME_CORE_INC_SHARED_H_ +#define HSA_RUNTME_CORE_INC_SHARED_H_ + +#include "core/util/utils.h" +#include + +#include +#include + +namespace core { +/// @brief Base class encapsulating the allocator and deallocator for +/// shared shared object. +class BaseShared { + public: + static void SetAllocateAndFree( + const std::function& allocate, + const std::function& free) { + allocate_ = allocate; + free_ = free; + } + + protected: + static std::function allocate_; + static std::function free_; +}; + +/// @brief Base class for classes that encapsulates object shared between +/// host and agents. Alignment defaults to __alignof(T) but may be increased. 
+template +class Shared : public BaseShared { + public: + Shared() { // Allocates storage through allocate_ and default-constructs T in it. + assert(allocate_ != nullptr && free_ != nullptr && + "Shared object allocator is not set"); + static_assert((__alignof(T) <= Align) || (Align == 0), + "Align is less than alignof(T)"); + + shared_object_ = + reinterpret_cast(allocate_(sizeof(T), Max(__alignof(T), Align))); + + assert(shared_object_ != NULL && "Failed on allocating shared_object_"); + + if (shared_object_ != NULL) new (shared_object_) T; // Placement new: constructs T in the allocated storage. + } + + virtual ~Shared() { // Destroys T in place, then releases its storage through free_. + assert(allocate_ != nullptr && free_ != nullptr && + "Shared object allocator is not set"); + + if (IsSharedObjectAllocationValid()) { + shared_object_->~T(); + free_(shared_object_); + } + } + + T* shared_object() const { return shared_object_; } + + bool IsSharedObjectAllocationValid() const { + return (shared_object_ != NULL); + } + + private: + T* shared_object_; // Result of allocate_; NULL when the allocation failed. +}; + +} // namespace core +#endif // header guard diff --git a/runtime/hsa-runtime/core/hsacore.so.def b/runtime/hsa-runtime/core/hsacore.so.def new file mode 100644 index 0000000000..769809dc96 --- /dev/null +++ b/runtime/hsa-runtime/core/hsacore.so.def @@ -0,0 +1,143 @@ +{ +global: + hsa_init; + hsa_shut_down; + hsa_system_get_info; + hsa_system_extension_supported; + hsa_system_get_extension_table; + hsa_iterate_agents; + hsa_agent_get_info; + hsa_agent_get_exception_policies; + hsa_agent_extension_supported; + hsa_queue_create; + hsa_soft_queue_create; + hsa_queue_destroy; + hsa_queue_inactivate; + hsa_queue_load_read_index_acquire; + hsa_queue_load_read_index_relaxed; + hsa_queue_load_write_index_acquire; + hsa_queue_load_write_index_relaxed; + hsa_queue_store_write_index_relaxed; + hsa_queue_store_write_index_release; + hsa_queue_cas_write_index_acq_rel; + hsa_queue_cas_write_index_acquire; + hsa_queue_cas_write_index_relaxed; + hsa_queue_cas_write_index_release; + hsa_queue_add_write_index_acq_rel; + hsa_queue_add_write_index_acquire; + hsa_queue_add_write_index_relaxed; +
hsa_queue_add_write_index_release; + hsa_queue_store_read_index_relaxed; + hsa_queue_store_read_index_release; + hsa_agent_iterate_regions; + hsa_region_get_info; + hsa_memory_register; + hsa_memory_deregister; + hsa_memory_allocate; + hsa_memory_free; + hsa_memory_copy; + hsa_memory_assign_agent; + hsa_signal_create; + hsa_signal_destroy; + hsa_signal_load_relaxed; + hsa_signal_load_acquire; + hsa_signal_store_relaxed; + hsa_signal_store_release; + hsa_signal_wait_relaxed; + hsa_signal_wait_acquire; + hsa_signal_and_relaxed; + hsa_signal_and_acquire; + hsa_signal_and_release; + hsa_signal_and_acq_rel; + hsa_signal_or_relaxed; + hsa_signal_or_acquire; + hsa_signal_or_release; + hsa_signal_or_acq_rel; + hsa_signal_xor_relaxed; + hsa_signal_xor_acquire; + hsa_signal_xor_release; + hsa_signal_xor_acq_rel; + hsa_signal_exchange_relaxed; + hsa_signal_exchange_acquire; + hsa_signal_exchange_release; + hsa_signal_exchange_acq_rel; + hsa_signal_add_relaxed; + hsa_signal_add_acquire; + hsa_signal_add_release; + hsa_signal_add_acq_rel; + hsa_signal_subtract_relaxed; + hsa_signal_subtract_acquire; + hsa_signal_subtract_release; + hsa_signal_subtract_acq_rel; + hsa_signal_cas_relaxed; + hsa_signal_cas_acquire; + hsa_signal_cas_release; + hsa_signal_cas_acq_rel; + hsa_isa_from_name; + hsa_isa_get_info; + hsa_isa_compatible; + hsa_code_object_serialize; + hsa_code_object_deserialize; + hsa_code_object_destroy; + hsa_code_object_get_info; + hsa_code_object_get_symbol; + hsa_code_symbol_get_info; + hsa_code_object_iterate_symbols; + hsa_executable_create; + hsa_executable_destroy; + hsa_executable_load_code_object; + hsa_executable_freeze; + hsa_executable_get_info; + hsa_executable_global_variable_define; + hsa_executable_agent_global_variable_define; + hsa_executable_readonly_variable_define; + hsa_executable_validate; + hsa_executable_get_symbol; + hsa_executable_symbol_get_info; + hsa_executable_iterate_symbols; + hsa_status_string; + hsa_ext_program_create; + 
hsa_ext_program_destroy; + hsa_ext_program_add_module; + hsa_ext_program_iterate_modules; + hsa_ext_program_get_info; + hsa_ext_program_finalize; + hsa_amd_coherency_get_type; + hsa_amd_coherency_set_type; + hsa_amd_profiling_set_profiler_enabled; + hsa_amd_profiling_get_dispatch_time; + hsa_amd_profiling_convert_tick_to_system_domain; + hsa_amd_signal_wait_any; + hsa_amd_signal_async_handler; + hsa_amd_async_function; + hsa_amd_image_get_info_max_dim; + hsa_amd_queue_cu_set_mask; + hsa_amd_memory_fill; + hsa_amd_memory_async_copy; + hsa_amd_memory_lock; + hsa_amd_memory_unlock; + hsa_amd_agent_iterate_memory_pools; + hsa_amd_agent_memory_pool_get_info; + hsa_amd_agents_allow_access; + hsa_amd_memory_pool_get_info; + hsa_amd_memory_pool_allocate; + hsa_amd_memory_pool_free; + hsa_amd_memory_pool_can_migrate; + hsa_amd_memory_migrate; + hsa_amd_interop_map_buffer; + hsa_amd_interop_unmap_buffer; + hsa_amd_image_create; + hsa_ext_image_get_capability; + hsa_ext_image_data_get_info; + hsa_ext_image_create; + hsa_ext_image_import; + hsa_ext_image_export; + hsa_ext_image_copy; + hsa_ext_image_clear; + hsa_ext_image_destroy; + hsa_ext_sampler_create; + hsa_ext_sampler_destroy; + +local: + *; +}; diff --git a/runtime/hsa-runtime/core/inc/agent.h b/runtime/hsa-runtime/core/inc/agent.h new file mode 100644 index 0000000000..53ecd355b6 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/agent.h @@ -0,0 +1,264 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA runtime C++ interface file. 
+ +#ifndef HSA_RUNTME_CORE_INC_AGENT_H_ +#define HSA_RUNTME_CORE_INC_AGENT_H_ + +#include + +#include + +#include "core/inc/runtime.h" +#include "core/inc/checked.h" +#include "core/inc/isa.h" +#include "core/inc/queue.h" +#include "core/inc/memory_region.h" +#include "core/util/utils.h" + +namespace core { +class Signal; + +typedef void (*HsaEventCallback)(hsa_status_t status, hsa_queue_t* source, + void* data); + +class MemoryRegion; + +// Agent is intended to be a pure interface class and may be wrapped or +// replaced by tools libraries. All functions other than Convert, node_id, +// device_type, and public_handle must be virtual. +class Agent : public Checked<0xF6BC25EB17E6F917> { + public: + // @brief Convert agent object into hsa_agent_t. + // + // @param [in] agent Pointer to an agent. + // + // @retval hsa_agent_t + static __forceinline hsa_agent_t Convert(Agent* agent) { + const hsa_agent_t agent_handle = { + static_cast(reinterpret_cast(agent))}; + return agent_handle; + } + + // @brief Convert agent object into const hsa_agent_t. + // + // @param [in] agent Pointer to an agent. + // + // @retval const hsa_agent_t + static __forceinline const hsa_agent_t Convert(const Agent* agent) { + const hsa_agent_t agent_handle = { + static_cast(reinterpret_cast(agent))}; + return agent_handle; + } + + // @brief Convert hsa_agent_t handle into Agent*. + // + // @param [in] agent An hsa_agent_t handle. + // + // @retval Agent* + static __forceinline Agent* Convert(hsa_agent_t agent) { + return reinterpret_cast(agent.handle); + } + + // Lightweight RTTI for vendor specific implementations. + enum DeviceType { kAmdGpuDevice = 0, kAmdCpuDevice = 1, kUnknownDevice = 2 }; + + // @brief Agent class constructor. + // + // @param [in] type CPU or GPU or other. + explicit Agent(uint32_t node_id, DeviceType type) + : node_id_(node_id), device_type_(uint32_t(type)) { + public_handle_ = Convert(this); + } + + // @brief Agent class constructor.
+ // + // @param [in] type CPU or GPU or other. + explicit Agent(uint32_t node_id, uint32_t type) + : node_id_(node_id), device_type_(type) { + public_handle_ = Convert(this); + } + + // @brief Agent class destructor. + virtual ~Agent() {} + + // @brief Submit DMA copy command to move data from src to dst and wait + // until it is finished. + // + // @details The agent must be able to access @p dst and @p src. + // + // @param [in] dst Memory address of the destination. + // @param [in] src Memory address of the source. + // @param [in] size Copy size in bytes. + // + // @retval HSA_STATUS_SUCCESS The memory copy is finished and successful. + virtual hsa_status_t DmaCopy(void* dst, const void* src, size_t size) { + return HSA_STATUS_ERROR; + } + + // @brief Submit DMA copy command to move data from src to dst. This call + // does not wait until the copy is finished. + // + // @details The agent must be able to access @p dst and @p src. Memory copy + // will be performed after all signals in @p dep_signals have value of 0. + // On memory copy completion, the value of out_signal is decremented. + // + // @param [in] dst Memory address of the destination. + // @param [in] src Memory address of the source. + // @param [in] size Copy size in bytes. + // @param [in] dep_signals Array of signal dependencies. + // @param [in] out_signal Completion signal. + // + // @retval HSA_STATUS_SUCCESS The copy command has been submitted; completion is reported through @p out_signal. + virtual hsa_status_t DmaCopy(void* dst, const void* src, size_t size, + std::vector& dep_signals, + core::Signal& out_signal) { + return HSA_STATUS_ERROR; + } + + // @brief Submit DMA command to set the content of a pointer and wait + // until it is finished. + // + // @details The agent must be able to access @p ptr. + // + // @param [in] ptr Address of the memory to be set. + // @param [in] value The value/pattern that will be used to set @p ptr. + // @param [in] count Number of uint32_t elements to be set.
+ // + // @retval HSA_STATUS_SUCCESS The memory fill is finished and successful. + virtual hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) { + return HSA_STATUS_ERROR; + } + + // @brief Invoke the user provided callback for each region accessible by + // this agent. + // + // @param [in] callback User provided callback function. + // @param [in] data User provided pointer as input for @p callback. + // + // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed + // region returns ::HSA_STATUS_SUCCESS. + virtual hsa_status_t IterateRegion( + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const = 0; + + // @brief Create queue. + // + // @param [in] size Number of packets the queue is expected to hold. Must be a + // power of 2 greater than 0. + // @param [in] queue_type Queue type. + // @param [in] event_callback Callback invoked for every + // asynchronous event related to the newly created queue. May be NULL. The HSA + // runtime passes three arguments to the callback: a code identifying the + // event that triggered the invocation, a pointer to the queue where the event + // originated, and the application data. + // @param [in] data Application data that is passed to @p event_callback. + // @param [in] private_segment_size A hint to indicate the maximum expected + // private segment usage per work-item, in bytes. + // @param [in] group_segment_size A hint to indicate the maximum expected + // group segment usage per work-group, in bytes. + // @param [out] queue Memory location where the HSA runtime stores a pointer + // to the newly created queue. + // + // @retval HSA_STATUS_SUCCESS The queue has been created successfully. + virtual hsa_status_t QueueCreate(size_t size, hsa_queue_type_t queue_type, + HsaEventCallback event_callback, void* data, + uint32_t private_segment_size, + uint32_t group_segment_size, + Queue** queue) = 0; + + // @brief Query the value of an attribute.
+ // + // @param [in] attribute Attribute to query. + // @param [out] value Pointer to store the value of the attribute. + // + // @retval HSA_STATUS_SUCCESS @p value has been filled with the value of the + // attribute. + virtual hsa_status_t GetInfo(hsa_agent_info_t attribute, + void* value) const = 0; + + // @brief Returns an array of regions owned by the agent. + virtual const std::vector& regions() const = 0; + + // @brief Returns the agent's instruction set architecture. + virtual const Isa* isa() const = 0; + + // @brief Returns the device type (CPU/GPU/Others). + __forceinline uint32_t device_type() const { return device_type_; } + + // @brief Returns hsa_agent_t handle exposed to end user. + // + // @details Only matters when a tools library needs to intercept HSA calls. + __forceinline hsa_agent_t public_handle() const { return public_handle_; } + + // @brief Returns node id associated with this agent. + __forceinline uint32_t node_id() const { return node_id_; } + + protected: + // Intention here is to have a polymorphic update procedure for public_handle_ + // which is callable on any Agent* but only from some class derived from + // Agent*. do_set_public_handle should remain protected or private in all + // derived types. + static __forceinline void set_public_handle(Agent* agent, + hsa_agent_t handle) { + agent->do_set_public_handle(handle); + } + + virtual void do_set_public_handle(hsa_agent_t handle) { + public_handle_ = handle; + } + + hsa_agent_t public_handle_; + + private: + // @brief Node id.
+ const uint32_t node_id_; + + const uint32_t device_type_; + + // Forbid copying and moving of this object + DISALLOW_COPY_AND_ASSIGN(Agent); +}; +} // namespace core + +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/amd_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aql_queue.h new file mode 100644 index 0000000000..25cb252f84 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_aql_queue.h @@ -0,0 +1,412 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission.
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_HW_AQL_COMMAND_PROCESSOR_H_ +#define HSA_RUNTIME_CORE_INC_AMD_HW_AQL_COMMAND_PROCESSOR_H_ + +#include "core/inc/runtime.h" +#include "core/inc/signal.h" +#include "core/inc/queue.h" +#include "core/inc/amd_gpu_agent.h" + +namespace amd { +/// @brief Encapsulates HW Aql Command Processor functionality. It +/// provides the interface for things such as Doorbell register, read, +/// write pointers and a buffer. +class AqlQueue : public core::Queue, public core::Signal { + public: + static __forceinline bool IsType(core::Signal* signal) { + return signal->IsType(&rtti_id_); + } + + // Acquires/releases queue resources and requests HW schedule/deschedule.
+ AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, + ScratchInfo& scratch, core::HsaEventCallback callback, + void* err_data, bool is_kv = false); + + ~AqlQueue(); + + /// @brief Indicates if queue is valid or not + bool IsValid() const { return valid_; } + + /// @brief Queue interfaces + hsa_status_t Inactivate(); + + /// @brief Atomically reads the Read index with Acquire semantics + /// + /// @return uint64_t Value of read index + uint64_t LoadReadIndexAcquire(); + + /// @brief Atomically reads the Read index with Relaxed semantics + /// + /// @return uint64_t Value of read index + uint64_t LoadReadIndexRelaxed(); + + /// @brief Atomically reads the Write index with Acquire semantics + /// + /// @return uint64_t Value of write index + uint64_t LoadWriteIndexAcquire(); + + /// @brief Atomically reads the Write index with Relaxed semantics + /// + /// @return uint64_t Value of write index + uint64_t LoadWriteIndexRelaxed(); + + /// @brief This operation is illegal + void StoreReadIndexRelaxed(uint64_t value) { assert(false); } + + /// @brief This operation is illegal + void StoreReadIndexRelease(uint64_t value) { assert(false); } + + /// @brief Atomically writes the Write index with Relaxed semantics + /// + /// @param value New value of write index to update with + void StoreWriteIndexRelaxed(uint64_t value); + + /// @brief Atomically writes the Write index with Release semantics + /// + /// @param value New value of write index to update with + void StoreWriteIndexRelease(uint64_t value); + + /// @brief Compares and swaps Write index using Acquire and Release semantics + /// + /// @param expected Current value of write index + /// + /// @param value Value of new write index + /// + /// @return uint64_t Value of write index before the update + uint64_t CasWriteIndexAcqRel(uint64_t expected, uint64_t value); + + /// @brief Compares and swaps Write index using Acquire semantics + /// + /// @param expected Current value of write
index + /// + /// @param value Value of new write index + /// + /// @return uint64_t Value of write index before the update + uint64_t CasWriteIndexAcquire(uint64_t expected, uint64_t value); + + /// @brief Compares and swaps Write index using Relaxed semantics + /// + /// @param expected Current value of write index + /// + /// @param value Value of new write index + /// + /// @return uint64_t Value of write index before the update + uint64_t CasWriteIndexRelaxed(uint64_t expected, uint64_t value); + + /// @brief Compares and swaps Write index using Release semantics + /// + /// @param expected Current value of write index + /// + /// @param value Value of new write index + /// + /// @return uint64_t Value of write index before the update + uint64_t CasWriteIndexRelease(uint64_t expected, uint64_t value); + + /// @brief Updates the Write index using Acquire and Release semantics + /// + /// @param value Value of new write index + /// + /// @return uint64_t Value of write index before the update + uint64_t AddWriteIndexAcqRel(uint64_t value); + + /// @brief Updates the Write index using Acquire semantics + /// + /// @param value Value of new write index + /// + /// @return uint64_t Value of write index before the update + uint64_t AddWriteIndexAcquire(uint64_t value); + + /// @brief Updates the Write index using Relaxed semantics + /// + /// @param value Value of new write index + /// + /// @return uint64_t Value of write index before the update + uint64_t AddWriteIndexRelaxed(uint64_t value); + + /// @brief Updates the Write index using Release semantics + /// + /// @param value Value of new write index + /// + /// @return uint64_t Value of write index before the update + uint64_t AddWriteIndexRelease(uint64_t value); + + /// @brief Set CU Masking + /// + /// @param num_cu_mask_count size of mask bit array + /// + /// @param cu_mask pointer to cu mask + /// + /// @return hsa_status_t + hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, + const uint32_t*
cu_mask); + + /// @brief This operation is illegal + hsa_signal_value_t LoadRelaxed() { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t LoadAcquire() { + assert(false); + return 0; + } + + /// @brief Update signal value using Relaxed semantics + void StoreRelaxed(hsa_signal_value_t value); + + /// @brief Update signal value using Release semantics + void StoreRelease(hsa_signal_value_t value); + + /// @brief This operation is illegal + hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + void AndRelaxed(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void AndAcquire(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void AndRelease(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void AndAcqRel(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void OrRelaxed(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void OrAcquire(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void OrRelease(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void OrAcqRel(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void XorRelaxed(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void XorAcquire(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void
XorRelease(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void XorAcqRel(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void AddRelaxed(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void AddAcquire(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void AddRelease(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void AddAcqRel(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void SubRelaxed(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void SubAcquire(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void SubRelease(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + void SubAcqRel(hsa_signal_value_t value) { assert(false); } + + /// @brief This operation is illegal + hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t ExchRelease(hsa_signal_value_t value) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, + hsa_signal_value_t value) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, + hsa_signal_value_t value) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasRelease(hsa_signal_value_t expected, + hsa_signal_value_t value) { +
assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, + hsa_signal_value_t value) { + assert(false); + return 0; + } + + /// @brief This operation is illegal + hsa_signal_value_t* ValueLocation() const { + assert(false); + return NULL; + } + + /// @brief This operation is illegal + HsaEvent* EopEvent() { + assert(false); + return NULL; + } + + // 64 byte-aligned allocation and release, for Queue::amd_queue_. + void* operator new(size_t size); + void* operator new(size_t size, void* ptr) { return ptr; } + void operator delete(void* ptr); + void operator delete(void*, void*) {} + + protected: + bool _IsA(rtti_t id) const { return id == &rtti_id_; } + + private: + uint32_t ComputeRingBufferMinPkts(); + uint32_t ComputeRingBufferMaxPkts(); + + // (De)allocates and (de)registers ring_buf_. + void AllocRegisteredRingBuffer(uint32_t queue_size_pkts); + void FreeRegisteredRingBuffer(); + + static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg); + + // AQL packet ring buffer + void* ring_buf_; + + // Size of ring_buf_ allocation. + // This may be larger than (amd_queue_.hsa_queue.size * sizeof(AqlPacket)).
+ uint32_t ring_buf_alloc_bytes_; + + // Id of the Queue used in communication with thunk + HSA_QUEUEID queue_id_; + + // Indicates if queue is valid + bool valid_; + + // Indicates if queue is inactive (NOTE(review): the field is named active_ -- confirm polarity) + int32_t active_; + + // Cached value of HsaNodeProperties.HSA_CAPABILITY.DoorbellType + int doorbell_type_; + + // Handle of agent, which queue is attached to + GpuAgent* agent_; + + hsa_profile_t agent_profile_; + + uint32_t queue_full_workaround_; + + // Handle of scratch memory descriptor + ScratchInfo queue_scratch_; + + core::HsaEventCallback errors_callback_; + + void* errors_data_; + + // Is KV device queue + bool is_kv_queue_; + + // Shared event used for queue errors + static HsaEvent* queue_event_; + + // Queue count - used to ref count queue_event_ + static volatile uint32_t queue_count_; + + // Mutex for queue_event_ manipulation + static KernelMutex queue_lock_; + + static int rtti_id_; + + // Forbid copying and moving of this object + DISALLOW_COPY_AND_ASSIGN(AqlQueue); +}; +} // namespace amd +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/amd_blit_kernel.h b/runtime/hsa-runtime/core/inc/amd_blit_kernel.h new file mode 100644 index 0000000000..665b7e7dc3 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_blit_kernel.h @@ -0,0 +1,174 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_ +#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_ + +#include + +#include "core/inc/blit.h" + +namespace amd { +class BlitKernel : public core::Blit { // Blit implementation that submits copy/fill work as AQL packets. + public: + explicit BlitKernel(); + virtual ~BlitKernel() override; + + /// @brief Initialize a blit kernel object.
+ /// + /// @param agent Reference to the agent that will execute the AQL packets. + /// + /// @return hsa_status_t + virtual hsa_status_t Initialize(const core::Agent& agent) override; + + /// @brief Marks the blit kernel object as invalid and uncouples its link with + /// the underlying AQL kernel queue. Use of the blit object + /// once it has been released is illegal and any behavior is indeterminate + /// + /// @note: The call will block until all AQL packets have been executed. + /// + /// @return hsa_status_t + virtual hsa_status_t Destroy() override; + + /// @brief Submit an AQL packet to perform vector copy. The call is blocking + /// until the command execution is finished. + /// + /// @param dst Memory address of the copy destination. + /// @param src Memory address of the copy source. + /// @param size Size of the data to be copied. + virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, + size_t size) override; + + /// @brief Submit a linear copy command to the underlying compute device's + /// control block. The call is non blocking. The memory transfer will start + /// after all dependent signals are satisfied. After the transfer is + /// completed, the out signal will be decremented. + /// + /// @param dst Memory address of the copy destination. + /// @param src Memory address of the copy source. + /// @param size Size of the data to be copied. + /// @param dep_signals Array of dependent signals. + /// @param out_signal Output signal. + virtual hsa_status_t SubmitLinearCopyCommand( + void* dst, const void* src, size_t size, + std::vector& dep_signals, + core::Signal& out_signal) override; + + /// @brief Submit an AQL packet to perform memory fill. The call is blocking + /// until the command execution is finished. + /// + /// @param ptr Memory address of the fill destination. + /// @param value Value to be set. + /// @param count Number of uint32_t elements to be set to the value.
+ virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value, + size_t count) override; + + private: + struct __ALIGNED__(16) KernelCopyArgs { + const void* src; + void* dst; + uint64_t size; + uint32_t use_vector; + }; + + struct __ALIGNED__(16) KernelFillArgs { + void* ptr; + uint64_t num; + uint32_t value; + }; + + /// Reserve a slot in the queue buffer. The call will wait until the queue + /// buffer has room. + uint64_t AcquireWriteIndex(uint32_t num_packet); + + /// Update the queue doorbell register with ::write_index. This + /// function also serializes concurrent doorbell updates to ensure that the + /// packet processor doesn't get an invalid packet. + void ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet); + + /// Wait until all packets are finished. + hsa_status_t FenceRelease(uint64_t write_index, uint32_t num_copy_packet, + hsa_fence_scope_t fence); + + void PopulateQueue(uint64_t index, uint64_t code_handle, void* args, + uint32_t grid_size_x, hsa_signal_t completion_signal); + + KernelCopyArgs* ObtainAsyncKernelCopyArg(); + + /// Handle to the vector copy kernel. + uint64_t copy_code_handle_; + + /// Handle to the vector copy aligned kernel. + uint64_t copy_aligned_code_handle_; + + /// Handle to the fill memory kernel. + uint64_t fill_code_handle_; + + /// AQL queue for submitting the vector copy kernel. + hsa_queue_t* queue_; + uint32_t queue_bitmask_; + + /// Index to track concurrent kernel launch. + volatile uint64_t cached_index_; + + /// Pointer to the kernel argument buffer. + void* kernarg_; + KernelCopyArgs* kernarg_async_; + uint32_t kernarg_async_mask_; + volatile uint32_t kernarg_async_counter_; + + /// Completion signal for every kernel dispatched. + hsa_signal_t completion_signal_; + + /// Lock to synchronize access to kernarg_ and completion_signal_ + std::mutex lock_; + + /// Pointer to memory containing the ISA and argument buffer.
+ void* code_arg_buffer_; + + static const size_t kMaxCopyCount; + static const size_t kMaxFillCount; + static const uint32_t kGroupSize; +}; +} // namespace amd + +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/amd_blit_kernel_kv.h b/runtime/hsa-runtime/core/inc/amd_blit_kernel_kv.h new file mode 100644 index 0000000000..55ab3c8031 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_blit_kernel_kv.h @@ -0,0 +1,479 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission.
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_ +#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_ + +#include + +#define HSA_VECTOR_COPY_KV_AKC_SIZE 368 +#define HSA_VECTOR_COPY_KV_AKC_OFFSET 256 + +/*****HSAIL code of the ISA in ::kVectorCopyRawKv. +module &m:1:0:$full:$large:$default; + +prog kernel &__vector_copy_kernel( + kernarg_u64 %src, + kernarg_u64 %dst, + kernarg_u64 %size) +{ + @__vector_copy_kernel_entry: + // BB#0: // %entry + workitemabsid_u32 $s0, 0; + cvt_u64_u32 $d0, $s0; + ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; + cmp_ge_b1_u64 $c0, $d0, $d1; + cbr_b1 $c0, @BB0_2; + // BB#1: // %if.end + ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; + ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; + add_u64 $d2, $d2, $d0; + add_u64 $d0, $d1, $d0; + ld_global_u8 $s0, [$d0]; + st_global_u8 $s0, [$d2]; + + @BB0_2: + // %return + ret; +}; +*/ + +static char kVectorCopyRawKv[] = { + 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, + 0, -104, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, + 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, + 112, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 65, 0, -116, 0, -112, 0, 0, 0, + 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 11, 0, 5, 0, 5, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 3, 0, 0, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, -116, -65, + 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, 0, + 0, 0, 74, 4, 7, 64, -64, -128, 2, 2, 126, 127, 0, + -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 15, 0, -120, + -65, 0, 7, -126, -64, 127, 0, -116, -65, 4, 0, 2, 74, + 5, 2, 4, 126, 2, 106, 80, -46, 2, 1, -87, 1, 0, + 0, 32, -36, 1, 0, 0, 1, 6, 0, 6, 74, 7, 2, + 4, 126, 4, 106, 80, -46, 2, 1, -87, 1, 112, 0, -116, + -65, 0, 0, 96, -36, 3, 1, 0, 0, 0, 0, -127, -65, + 3, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, + 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, + 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, + 3, 0, 0, 0, 28, 0, 0, 0, 3, 0, 0, 0, 65, + 77, 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, 68, + 71, 80, 85, 0, 0, 3, 0, 0, 0, 40, 0, 0, 0, + 4, 0, 0, 0, 65, 77, 68, 0, 26, 0, 0, 0, 0, + 0, 
0, 0, 0, 0, 0, 0, 65, 77, 68, 32, 72, 83, + 65, 32, 82, 117, 110, 116, 105, 109, 101, 32, 70, 105, 110, + 97, 108, 105, 122, 101, 114, 0, 0, 0, 38, 95, 95, 118, + 101, 99, 116, 111, 114, 95, 99, 111, 112, 121, 95, 107, 101, + 114, 110, 101, 108, 0, 95, 95, 104, 115, 97, 95, 115, 101, + 99, 116, 105, 111, 110, 46, 104, 115, 97, 116, 101, 120, 116, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 22, 0, 0, 0, 3, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, 0, 46, 110, + 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, 98, 0, 46, + 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, 115, 116, 114, + 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 7, 0, -64, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 112, 2, 0, 0, 0, 0, 0, + 0, -104, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0, + 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 3, 0, 0, 0, 0, 0, 0, + 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 2, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 56, 3, 0, 0, 0, 0, 0, 0, 48, + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, + 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, 0, 0, 0, + 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 104, 3, 0, 0, 0, 0, 0, 0, 42, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, +}; +extern char* const 
kVectorCopyKvObject = &kVectorCopyRawKv[0]; +extern size_t const kVectorCopyKvObjectSize = sizeof(kVectorCopyRawKv); + +#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE 436 +#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET 256 + +/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawKv. +module &m:1:0:$full:$large:$default; +extension "amd:gcn"; + +prog kernel &__copy_buffer_aligned_kernel( + kernarg_u64 %src, + kernarg_u64 %dst, + kernarg_u64 %size, + kernarg_u32 %use_vector) +{ + @__copy_buffer_aligned_kernel_entry: + // BB#0: // %entry + workitemabsid_u32 $s0, 0; + cvt_u64_u32 $d0, $s0; + ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; + cmp_ge_b1_u64 $c0, $d0, $d1; + cbr_b1 $c0, @LBB0_4; + // BB#1: // %if.end + ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; + ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; + ld_kernarg_align(4)_width(all)_u32 $s0, [%use_vector]; + cmp_ne_b1_s32 $c0, $s0, 1; + cbr_b1 $c0, @LBB0_3; + // BB#2: // %if.then2 + shl_u64 $d0, $d0, 4; + add_u64 $d2, $d2, $d0; + add_u64 $d0, $d1, $d0; + ld_v4_global_align(16)_const_u32 ($s0, $s1, $s2, $s3), [$d0]; + st_v4_global_align(16)_u32 ($s0, $s1, $s2, $s3), [$d2]; + br @LBB0_4; + + @LBB0_3: + // %if.else + shl_u64 $d0, $d0, 2; + add_u64 $d2, $d2, $d0; + add_u64 $d0, $d1, $d0; + ld_global_align(4)_const_u32 $s0, [$d0]; + st_global_align(4)_u32 $s0, [$d2]; + + @LBB0_4: + // %if.end6 + ret; +}; +*/ + +static char kVectorCopyAlignedRawKv[] = { + 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, + 0, -8, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, + 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, -76, 1, 0, 0, 0, 0, 0, 0, + -76, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 65, 0, -84, 0, -112, 0, 0, 0, + 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 11, 0, 7, 0, 7, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, -116, -65, + 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, 0, + 0, 0, 74, 4, 7, 64, -64, -128, 2, 2, 126, 127, 0, + -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 32, 0, -120, + -65, 6, 7, 1, -64, 0, 7, -126, -64, 127, 0, -116, -65, + 2, -127, 0, -65, 14, 0, -124, -65, 0, 0, -62, -46, 0, + 9, 1, 0, 4, 0, 4, 74, 5, 2, 6, 126, 3, 3, + 6, 80, 0, 0, 56, -36, 2, 0, 0, 2, 6, 0, 0, + 74, 7, 2, 12, 126, 6, 3, 2, 80, 112, 0, -116, -65, + 0, 0, 120, -36, 0, 2, 0, 0, 13, 0, -126, -65, 0, + 0, -62, -46, 0, 5, 1, 0, 4, 0, 4, 74, 5, 2, + 6, 126, 3, 3, 6, 80, 0, 0, 48, -36, 2, 0, 0, + 2, 6, 0, 0, 74, 7, 2, 6, 126, 3, 3, 2, 80, + 112, 0, -116, -65, 0, 0, 112, -36, 0, 2, 0, 0, 0, + 0, -127, -65, 0, 0, 0, 0, 4, 0, 0, 0, 8, 0, + 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 4, 0, 0, 0, 12, 0, 0, 0, + 2, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 4, 0, 0, 0, 25, 0, + 0, 0, 5, 0, 0, 
0, 65, 77, 68, 0, 22, 0, 45, + 104, 115, 97, 95, 99, 97, 108, 108, 95, 99, 111, 110, 118, + 101, 110, 116, 105, 111, 110, 61, 0, 0, 0, 0, 0, 4, + 0, 0, 0, 30, 0, 0, 0, 3, 0, 0, 0, 65, 77, + 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, 68, 71, + 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 8, + 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, 0, -32, 101, + -118, -12, -1, 127, 0, 0, 38, 95, 95, 99, 111, 112, 121, + 95, 98, 117, 102, 102, 101, 114, 95, 97, 108, 105, 103, 110, + 101, 100, 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, 104, + 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, 115, + 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -76, 1, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 3, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, + 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, + 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 46, + 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, -76, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -72, 2, + 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, + 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 3, 0, + 0, 0, 0, 0, 0, 52, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, + 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, -104, 3, 0, 0, + 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, + 0, 0, 0, 0, 0, 0, 
0, 8, 0, 0, 0, 0, 0, + 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, + 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, -56, 3, 0, 0, 0, + 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +extern char* const kVectorCopyAlignedKvObject = &kVectorCopyAlignedRawKv[0]; +extern size_t const kVectorCopyAlignedKvObjectSize = + sizeof(kVectorCopyAlignedRawKv); + +#define HSA_FILL_MEMORY_KV_AKC_SIZE 352 +#define HSA_FILL_MEMORY_KV_AKC_OFFSET 256 + +/*****HSAIL code of the ISA in ::kFillMemoryRawKv. +module &m:1:0:$full:$large:$default; +extension "amd:gcn"; + +prog kernel &__fill_memory_kernel( +kernarg_u64 %ptr, +kernarg_u64 %num, +kernarg_u32 %value) +{ +@__fill_memory_kernel_entry: +// BB#0: // %entry +workitemabsid_u32 $s0, 0; +cvt_u64_u32 $d0, $s0; +ld_kernarg_align(8)_width(all)_u64 $d1, [%num]; +cmp_ge_b1_u64 $c0, $d0, $d1; +cbr_b1 $c0, @LBB0_2; +// BB#1: // %if.end +ld_kernarg_align(8)_width(all)_u64 $d1, [%ptr]; +ld_kernarg_align(4)_width(all)_u32 $s0, [%value]; +shl_u64 $d0, $d0, 2; +add_u64 $d0, $d1, $d0; +st_global_align(4)_u32 $s0, [$d0]; + +@LBB0_2: +// %return +ret; +}; +*/ + +static char kFillMemoryRawKv[] = { + 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, -104, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 56, 0, + 1, 0, 64, 0, 6, 0, 5, 0, 3, 0, 0, 96, 6, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 1, + 0, 0, 0, 0, 0, 0, 96, 1, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, -84, 0, + -112, 0, 0, 0, 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 11, 0, 3, 0, 3, 0, 0, 0, 9, 0, + 0, 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, + -116, -65, 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, + 0, 0, 0, 74, 2, 7, 64, -64, -128, 2, 2, 126, 127, 0, + -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 11, 0, -120, -65, + 0, 7, 65, -64, 4, 7, 2, -64, 0, 0, -62, -46, 0, 5, + 1, 0, 127, 0, -116, -65, 2, 0, 0, 74, 3, 2, 4, 126, + 2, 3, 2, 80, 4, 2, 4, 126, 0, 0, 112, -36, 0, 2, + 0, 0, 0, 0, -127, -65, 4, 0, 0, 0, 8, 0, 0, 0, + 1, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, + 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 0, 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, 0, 0, + 65, 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, 99, 97, 108, + 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, 0, + 0, 0, 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, 3, 0, + 0, 0, 65, 77, 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, + 68, 71, 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 8, 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, 0, 48, 123, + 44, -103, -4, 127, 0, 0, 38, 95, 95, 102, 105, 108, 108, 95, + 109, 101, 109, 111, 114, 121, 95, 107, 101, 114, 110, 101, 108, 0, + 95, 95, 104, 115, 97, 95, 115, 101, 99, 116, 105, 
111, 110, 46, + 104, 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 96, 1, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 3, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, 0, + 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, 98, 0, + 46, 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, 115, 116, 114, + 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, + -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 96, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, + 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 96, 2, 0, 0, 0, 0, + 0, 0, -88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0, 0, 0, + 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 8, 3, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 24, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56, 3, 0, 0, + 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, + 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, 0, + 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 104, 3, 0, 0, 0, 0, 0, 0, 42, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, +}; + +extern char* const kFillMemoryKvObject = &kFillMemoryRawKv[0]; +extern size_t const kFillMemoryKvObjectSize = sizeof(kFillMemoryRawKv); +#endif // header guard \ No newline at end of file diff --git a/runtime/hsa-runtime/core/inc/amd_blit_kernel_vi.h b/runtime/hsa-runtime/core/inc/amd_blit_kernel_vi.h new file mode 100644 index 
0000000000..ca03cd8dae --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_blit_kernel_vi.h @@ -0,0 +1,490 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_ +#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_ + +#include + +#define HSA_VECTOR_COPY_VI_AKC_SIZE 380 +#define HSA_VECTOR_COPY_VI_AKC_OFFSET 256 + +/*****HSAIL code of the ISA in ::kVectorCopyRawVi. +module &m:1:0:$full:$large:$default; + +prog kernel &__vector_copy_kernel( + kernarg_u64 %src, + kernarg_u64 %dst, + kernarg_u64 %size) +{ + @__vector_copy_kernel_entry: + // BB#0: // %entry + workitemabsid_u32 $s0, 0; + cvt_u64_u32 $d0, $s0; + ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; + cmp_ge_b1_u64 $c0, $d0, $d1; + cbr_b1 $c0, @BB0_2; + // BB#1: // %if.end + ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; + ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; + add_u64 $d2, $d2, $d0; + add_u64 $d0, $d1, $d0; + ld_global_u8 $s0, [$d0]; + st_global_u8 $s0, [$d2]; + + @BB0_2: + // %return + ret; +}; +*/ + +static char kVectorCopyRawVi[] = { + 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, + 0, -72, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, + 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 124, 1, 0, 0, 0, 0, 0, 0, + 124, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, -63, 2, -84, 0, -112, 0, 0, 0, + 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 96, 0, 5, 0, 5, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, + 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, + 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 16, 0, + 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, + 125, 106, 32, -128, -66, 16, 0, -120, -65, 3, 1, 10, -64, + 0, 0, 0, 0, 127, 0, -116, -65, 4, 0, 2, 50, 5, + 2, 4, 126, 2, 106, 28, -47, 2, 1, -87, 1, 0, 0, + 64, -36, 1, 0, 0, 1, 6, 0, 6, 50, 7, 2, 4, + 126, 4, 106, 28, -47, 2, 1, -87, 1, 112, 0, -116, -65, + 0, 0, 96, -36, 3, 1, 0, 0, 0, 0, -127, -65, 0, + 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 1, 0, + 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 4, 0, 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, + 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1, 0, 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, + 0, 0, 65, 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, + 99, 97, 108, 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, + 111, 110, 61, 0, 0, 0, 0, 0, 4, 0, 0, 0, 30, + 0, 0, 0, 3, 0, 0, 0, 65, 77, 68, 0, 4, 0, + 7, 0, 8, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, + 0, 65, 77, 68, 0, 65, 77, 68, 71, 80, 85, 0, 0, + 0, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 4, + 0, 0, 0, 65, 77, 68, 0, 
32, 103, -72, 81, -3, 127, + 0, 0, 38, 95, 95, 118, 101, 99, 116, 111, 114, 95, 99, + 111, 112, 121, 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, + 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, + 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 124, 1, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, + 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, + 101, 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, + 114, 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, + 46, 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 124, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128, + 2, 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 16, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 3, + 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, + 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 3, 0, + 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, + 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, -120, 3, 0, 0, + 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +extern char* const kVectorCopyViObject = &kVectorCopyRawVi[0]; +extern size_t const kVectorCopyViObjectSize = sizeof(kVectorCopyRawVi); + 
+#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE 452 +#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET 256 + +/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawVi. +module &m:1:0:$full:$large:$default; +extension "amd:gcn"; + +prog kernel &__copy_buffer_aligned_kernel( + kernarg_u64 %src, + kernarg_u64 %dst, + kernarg_u64 %size, + kernarg_u32 %use_vector) +{ + @__copy_buffer_aligned_kernel_entry: + // BB#0: // %entry + workitemabsid_u32 $s0, 0; + cvt_u64_u32 $d0, $s0; + ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; + cmp_ge_b1_u64 $c0, $d0, $d1; + cbr_b1 $c0, @LBB0_4; + // BB#1: // %if.end + ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; + ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; + ld_kernarg_align(4)_width(all)_u32 $s0, [%use_vector]; + cmp_ne_b1_s32 $c0, $s0, 1; + cbr_b1 $c0, @LBB0_3; + // BB#2: // %if.then2 + shl_u64 $d0, $d0, 4; + add_u64 $d2, $d2, $d0; + add_u64 $d0, $d1, $d0; + ld_v4_global_align(16)_const_u32 ($s0, $s1, $s2, $s3), [$d0]; + st_v4_global_align(16)_u32 ($s0, $s1, $s2, $s3), [$d2]; + br @LBB0_4; + + @LBB0_3: + // %if.else + shl_u64 $d0, $d0, 2; + add_u64 $d2, $d2, $d0; + add_u64 $d0, $d1, $d0; + ld_global_align(4)_const_u32 $s0, [$d0]; + st_global_align(4)_u32 $s0, [$d2]; + + @LBB0_4: + // %if.end6 + ret; +}; +*/ + +static char kVectorCopyAlignedRawVi[] = { + 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, + 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, + 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, -60, 1, 0, 0, 0, 0, 0, 0, + -60, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 65, 0, -84, 0, -112, 0, 0, 0, + 11, 0, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 16, 0, 8, 0, 8, 0, 0, 0, 12, 0, 0, + 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, + 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, + 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 16, 0, + 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, + 125, 106, 32, -128, -66, 34, 0, -120, -65, -125, 0, 2, -64, + 24, 0, 0, 0, 3, 2, 10, -64, 0, 0, 0, 0, 127, + 0, -116, -65, 2, -127, 0, -65, 14, 0, -124, -65, 0, 0, + -113, -46, -124, 0, 2, 0, 8, 0, 4, 50, 9, 2, 6, + 126, 3, 3, 6, 56, 0, 0, 92, -36, 2, 0, 0, 4, + 10, 0, 0, 50, 11, 2, 4, 126, 2, 3, 2, 56, 112, + 0, -116, -65, 0, 0, 124, -36, 0, 4, 0, 0, 13, 0, + -126, -65, 0, 0, -113, -46, -126, 0, 2, 0, 8, 0, 4, + 50, 9, 2, 6, 126, 3, 3, 6, 56, 0, 0, 80, -36, + 2, 0, 0, 4, 10, 0, 0, 50, 11, 2, 4, 126, 2, + 3, 2, 56, 112, 0, -116, -65, 0, 0, 112, -36, 0, 4, + 0, 0, 0, 0, -127, -65, 0, 0, 0, 0, 4, 0, 0, + 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 12, + 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 4, 0, 0, + 0, 25, 0, 0, 0, 5, 0, 0, 0, 65, 77, 68, 0, + 22, 0, 45, 104, 115, 97, 95, 99, 
97, 108, 108, 95, 99, + 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, 3, 0, 0, + 0, 65, 77, 68, 0, 4, 0, 7, 0, 8, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, 65, + 77, 68, 71, 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, + 0, 0, 8, 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, + 0, 96, 62, -27, 85, -1, 127, 0, 0, 38, 95, 95, 99, + 111, 112, 121, 95, 98, 117, 102, 102, 101, 114, 95, 97, 108, + 105, 103, 110, 101, 100, 95, 107, 101, 114, 110, 101, 108, 0, + 95, 95, 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, + 46, 104, 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, -60, 1, 0, 0, 0, 0, 0, 0, 30, 0, + 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, + 97, 116, 101, 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, + 115, 116, 114, 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, + 98, 0, 46, 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, + 0, -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -60, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -56, 2, 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 16, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 112, 3, 0, 0, 0, 0, 0, 0, 52, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 24, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -88, + 3, 0, 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, + 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, + 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, + 
32, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -40, 3, + 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; +extern char* const kVectorCopyAlignedViObject = &kVectorCopyAlignedRawVi[0]; +extern size_t const kVectorCopyAlignedViObjectSize = + sizeof(kVectorCopyAlignedRawVi); + +#define HSA_FILL_MEMORY_VI_AKC_SIZE 368 +#define HSA_FILL_MEMORY_VI_AKC_OFFSET 256 + +/*****HSAIL code of the ISA in ::kFillMemoryRawVi. +module &m:1:0:$full:$large:$default; +extension "amd:gcn"; + +prog kernel &__fill_memory_kernel( + kernarg_u64 %ptr, + kernarg_u64 %num, + kernarg_u32 %value) +{ + @__fill_memory_kernel_entry: + // BB#0: // %entry + workitemabsid_u32 $s0, 0; + cvt_u64_u32 $d0, $s0; + ld_kernarg_align(8)_width(all)_u64 $d1, [%num]; + cmp_ge_b1_u64 $c0, $d0, $d1; + cbr_b1 $c0, @LBB0_2; + // BB#1: // %if.end + ld_kernarg_align(8)_width(all)_u64 $d1, [%ptr]; + ld_kernarg_align(4)_width(all)_u32 $s0, [%value]; + shl_u64 $d0, $d0, 2; + add_u64 $d0, $d1, $d0; + st_global_align(4)_u32 $s0, [$d0]; + + @LBB0_2: + // %return + ret; +}; +*/ + +static char kFillMemoryRawVi[] = { + 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, + 0, -88, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, + 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, + 112, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 
0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 64, 0, -84, 0, -112, 0, 0, 0, + 11, 0, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 13, 0, 3, 0, 3, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, + 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, + 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 8, 0, + 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, + 125, 106, 32, -128, -66, 13, 0, -120, -65, -125, 0, 6, -64, + 0, 0, 0, 0, 3, 1, 2, -64, 16, 0, 0, 0, 0, + 0, -113, -46, -126, 0, 2, 0, 127, 0, -116, -65, 2, 0, + 0, 50, 3, 2, 4, 126, 2, 3, 2, 56, 4, 2, 4, + 126, 0, 0, 112, -36, 0, 2, 0, 0, 0, 0, -127, -65, + 4, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, + 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, + 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, + 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, 0, 0, 65, + 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, 99, 97, 108, + 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, + 0, 0, 0, 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, + 3, 0, 0, 0, 65, 77, 68, 0, 4, 0, 7, 0, 8, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 65, 77, + 68, 0, 65, 77, 68, 71, 80, 85, 0, 0, 0, 0, 0, + 0, 4, 0, 0, 0, 8, 0, 0, 0, 4, 0, 0, 0, + 65, 77, 68, 0, 16, -20, 88, 97, -4, 127, 0, 0, 38, + 95, 95, 102, 105, 108, 108, 95, 109, 101, 109, 111, 114, 121, + 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, 104, 115, 97, + 95, 
115, 101, 99, 116, 105, 111, 110, 46, 104, 115, 97, 116, + 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112, 1, + 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 3, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, + 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, + 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, + 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, + 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 112, 2, 0, 0, + 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, + 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 24, 3, 0, 0, 0, + 0, 0, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 72, 3, 0, 0, 0, 0, + 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, + 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, + 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, + 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 120, 3, 0, 0, 0, 0, 0, + 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, +}; + +extern char* const kFillMemoryViObject = &kFillMemoryRawVi[0]; +extern size_t const kFillMemoryViObjectSize = sizeof(kFillMemoryRawVi); +#endif // header guard \ No newline at end of file diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h new file 
mode 100644 index 0000000000..db851ea49b --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -0,0 +1,218 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_SDMA_H_ +#define HSA_RUNTIME_CORE_INC_AMD_BLIT_SDMA_H_ + +#include + +#include "hsakmt.h" + +#include "core/inc/blit.h" +#include "core/inc/runtime.h" +#include "core/inc/signal.h" +#include "core/util/utils.h" + +namespace amd { +class BlitSdma : public core::Blit { + public: + explicit BlitSdma(); + + virtual ~BlitSdma() override; + + /// @brief Initialize a User Mode SDMA Queue object. Input parameters specify + /// properties of queue being created. + /// + /// @param agent Reference to the agent that will execute the PM4 commands. + /// + /// @return hsa_status_t + virtual hsa_status_t Initialize(const core::Agent& agent) override; + + /// @brief Marks the queue object as invalid and uncouples its link with + /// the underlying compute device's control block. Use of queue object + /// once it has been released is illegal and any behavior is indeterminate. + /// + /// @note: The call will block until all packets have executed. + /// + /// @return hsa_status_t + virtual hsa_status_t Destroy() override; + + /// @brief Submit a linear copy command to the queue buffer. + /// + /// @param dst Memory address of the copy destination. + /// @param src Memory address of the copy source. + /// @param size Size of the data to be copied. + virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, + size_t size) override; + + /// @brief Submit a linear copy command to the underlying compute device's + /// control block. The call is non-blocking. The memory transfer will start + /// after all dependent signals are satisfied.
After the transfer is + /// completed, the out signal will be decremented. + /// + /// @param dst Memory address of the copy destination. + /// @param src Memory address of the copy source. + /// @param size Size of the data to be copied. + /// @param dep_signals Array of dependent signals. + /// @param out_signal Output signal. + virtual hsa_status_t SubmitLinearCopyCommand( + void* dst, const void* src, size_t size, + std::vector& dep_signals, + core::Signal& out_signal) override; + + /// @brief Submit a linear fill command to the queue buffer. + /// + /// @param ptr Memory address of the fill destination. + /// @param value Value to be set. + /// @param count Number of uint32_t elements to be set to the value. + virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value, + size_t count) override; + + protected: + /// @brief Acquires the address into queue buffer where a new command + /// packet of specified size could be written. The address that is + /// returned is guaranteed to be unique even in a multi-threaded access + /// scenario. This function is guaranteed to return a pointer for writing + /// data into the queue buffer. + /// + /// @param cmd_size Command packet size in bytes. + /// + /// @return pointer into the queue buffer where a PM4 packet of specified size + /// could be written. NULL if input size is greater than the size of queue + /// buffer. + char* AcquireWriteAddress(uint32_t cmd_size); + + void UpdateWriteAndDoorbellRegister(uint32_t current_offset, + uint32_t new_offset); + + /// @brief Updates the Write Register of compute device to the end of + /// SDMA packet written into queue buffer. The update to Write Register + /// will be safe under multi-threaded usage scenario. Furthermore, updates + /// to Write Register are blocking until all prior updates are completed + /// i.e.
if two threads T1 & T2 were to call release, then updates by T2 + /// will block until T1 has completed its update (assumes T1 acquired the + /// write address first). + /// + /// @param cmd_addr pointer into the queue buffer where a PM4 packet was + /// written. + /// + /// @param cmd_size Command packet size in bytes. + void ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size); + + /// @brief Writes NO-OP words into queue buffer in case writing a command + /// causes the queue buffer to wrap. + /// + /// @param cmd_size Size in bytes of command causing queue buffer to wrap. + void WrapQueue(uint32_t cmd_size); + + /// @brief Build fence command + void BuildFenceCommand(char* fence_command_addr, uint32_t* fence, + uint32_t fence_value); + + uint32_t* ObtainFenceObject(); + + void WaitFence(uint32_t* fence, uint32_t fence_value); + + void BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, void* dst, + const void* src, size_t size); + + void BuildPollCommand(char* cmd_addr, void* addr, uint32_t reference); + + void BuildAtomicDecrementCommand(char* cmd_addr, void* addr); + + /// Indicates size of Queue buffer in bytes. + uint32_t queue_size_; + + /// Base address of the Queue buffer at construction time. + char* queue_start_addr_; + + uint32_t* fence_base_addr_; + uint32_t fence_pool_size_; + uint32_t fence_pool_mask_; + volatile uint32_t fence_pool_counter_; + + /// Queue resource descriptor for doorbell, read + /// and write indices + HsaQueueResource queue_resource_; + + /// @brief Current address of execution in Queue buffer. + /// + /// @note: The value of address is obtained by reading + /// the value of Write Register of the compute device. + /// Users should write to the Queue buffer at the current + /// address, else it will lead to execution error and potentially + /// a hang. + /// + /// @note: The value of Write Register does not always begin + /// with Zero after a Queue has been created. This needs to be + /// understood better.
This means that current address number of + /// words of Queue buffer is unavailable for use. + volatile uint32_t cached_reserve_offset_; + volatile uint32_t cached_commit_offset_; + + uint32_t linear_copy_command_size_; + + uint32_t fill_command_size_; + + uint32_t fence_command_size_; + + uint32_t poll_command_size_; + + uint32_t atomic_command_size_; + + /// Max copy size of a single linear copy command packet. + size_t max_single_linear_copy_size_; + + /// Max total copy size supported by the queue. + size_t max_total_linear_copy_size_; + + /// Max count of uint32_t of a single fill command packet. + size_t max_single_fill_size_; + + /// Max total fill count supported by the queue. + size_t max_total_fill_size_; + + std::mutex wrap_lock_; +}; +} // namespace amd + +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/amd_cpu_agent.h b/runtime/hsa-runtime/core/inc/amd_cpu_agent.h new file mode 100644 index 0000000000..1ad4ec0b72 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_cpu_agent.h @@ -0,0 +1,154 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// AMD specific HSA backend. + +#ifndef HSA_RUNTIME_CORE_INC_AMD_CPU_AGENT_H_ +#define HSA_RUNTIME_CORE_INC_AMD_CPU_AGENT_H_ + +#include + +#include "hsakmt.h" + +#include "core/inc/runtime.h" +#include "core/inc/agent.h" +#include "core/inc/queue.h" + +namespace amd { +// @brief Class to represent a CPU device. 
+class CpuAgent : public core::Agent { + public: + // @brief CpuAgent constructor. + // + // @param [in] node Node id. Each CPU in different socket will get distinct + // id. + // @param [in] node_props Node property. + CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props); + + // @brief CpuAgent destructor. + ~CpuAgent(); + + // @brief Invoke the user provided callback for each region accessible by + // this agent. + // + // @param [in] include_peer If true, the callback will also be invoked on each + // peer memory region accessible by this agent. If false, only invoke the + // callback on memory regions owned by this agent. + // @param [in] callback User provided callback function. + // @param [in] data User provided pointer as input for @p callback. + // + // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed + // region returns ::HSA_STATUS_SUCCESS. + hsa_status_t VisitRegion(bool include_peer, + hsa_status_t (*callback)(hsa_region_t region, + void* data), + void* data) const; + + // @brief Override from core::Agent. + hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region, + void* data), + void* data) const override; + + // @brief Override from core::Agent. + hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override; + + // @brief Override from core::Agent. + hsa_status_t QueueCreate(size_t size, hsa_queue_type_t queue_type, + core::HsaEventCallback event_callback, void* data, + uint32_t private_segment_size, + uint32_t group_segment_size, + core::Queue** queue) override; + + // @brief Returns number of data caches. + __forceinline size_t num_cache() const { return cache_props_.size(); } + + // @brief Returns data cache property. + // + // @param [in] idx Cache level. Indexes cache_props_ directly; no bounds check is done here. + __forceinline const HsaCacheProperties& cache_prop(int idx) const { + return cache_props_[idx]; + } + + // @brief Override from core::Agent.
+ const std::vector& regions() const override { + return regions_; + } + + // @brief Override from core::Agent. Always returns NULL for a CPU agent. + const core::Isa* isa() const override { return NULL; } + + private: + // @brief Query the driver to get the region list owned by this agent. + void InitRegionList(); + + // @brief Query the driver to get the cache properties. + void InitCacheList(); + + // @brief Invoke the user provided callback for every region in @p regions. + // + // @param [in] regions Array of region objects. + // @param [in] callback User provided callback function. + // @param [in] data User provided pointer as input for @p callback. + // + // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed + // region returns ::HSA_STATUS_SUCCESS. + hsa_status_t VisitRegion( + const std::vector& regions, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const; + + // @brief Node property. + const HsaNodeProperties properties_; + + // @brief Array of data cache property. The array index represents the cache + // level. + std::vector cache_props_; + + // @brief Array of regions owned by this agent. + std::vector regions_; + + DISALLOW_COPY_AND_ASSIGN(CpuAgent); +}; + +} // namespace amd + +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/amd_elf_image.hpp b/runtime/hsa-runtime/core/inc/amd_elf_image.hpp new file mode 100644 index 0000000000..99f8a1c3ae --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_elf_image.hpp @@ -0,0 +1,222 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_ELF_IMAGE_HPP_ +#define AMD_ELF_IMAGE_HPP_ + +#include +#include +#include +#include + +namespace amd { + namespace elf { + class Symbol; + class SymbolTable; + class Section; + class RelocationSection; + + class Segment { // Pure-virtual interface describing one segment of an image. + public: + virtual ~Segment() { } + virtual uint64_t type() const = 0; + virtual uint64_t memSize() const = 0; + virtual uint64_t align() const = 0; + virtual uint64_t imageSize() const = 0; + virtual uint64_t vaddr() const = 0; + virtual uint64_t flags() const = 0; + virtual const char* data() const = 0; + virtual uint16_t getSegmentIndex() = 0; + virtual bool updateAddSection(Section *section) = 0; + }; + + class Section { // Pure-virtual interface describing one section of an image. + public: + virtual ~Section() { } + virtual uint16_t getSectionIndex() const = 0; + virtual uint32_t type() const = 0; + virtual std::string Name() const = 0; + virtual uint64_t offset() const = 0; + virtual uint64_t addr() const = 0; + virtual bool updateAddr(uint64_t addr) = 0; + virtual uint64_t addralign() const = 0; + virtual uint64_t flags() const = 0; + virtual uint64_t size() const = 0; + virtual uint64_t nextDataOffset(uint64_t align) const = 0; + virtual uint64_t addData(const void *src, uint64_t size, uint64_t align) = 0; + virtual bool getData(uint64_t offset, void* dest, uint64_t size) = 0; + virtual Segment* segment() = 0; + virtual RelocationSection* asRelocationSection() = 0; + virtual bool hasRelocationSection() const = 0; + virtual RelocationSection* relocationSection(SymbolTable* symtab = 0) = 0; + virtual bool setMemSize(uint64_t s) = 0; + virtual uint64_t memSize() const = 0; + virtual bool setAlign(uint64_t a) = 0; + virtual uint64_t memAlign() const = 0; + }; + + class Relocation { // Pure-virtual interface describing one relocation entry. + public: + virtual ~Relocation() { } + virtual RelocationSection* section() = 0; + virtual uint32_t type() = 0; + virtual uint32_t symbolIndex() = 0; + virtual Symbol* symbol() = 0; + virtual uint64_t offset() = 0; +
virtual int64_t addend() = 0; + }; + + class RelocationSection : public virtual Section { + public: + virtual Relocation* addRelocation(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend) = 0; + virtual size_t relocationCount() const = 0; + virtual Relocation* relocation(size_t i) = 0; + virtual Section* targetSection() = 0; + }; + + class StringTable : public virtual Section { + public: + virtual const char* addString(const std::string& s) = 0; + virtual size_t addString1(const std::string& s) = 0; + virtual const char* getString(size_t ndx) = 0; + virtual size_t getStringIndex(const char* name) = 0; + }; + + class Symbol { + public: + virtual ~Symbol() { } + virtual uint32_t index() = 0; + virtual uint32_t type() = 0; + virtual uint32_t binding() = 0; + virtual uint64_t size() = 0; + virtual uint64_t value() = 0; + virtual unsigned char other() = 0; + virtual std::string name() = 0; + virtual Section* section() = 0; + virtual void setValue(uint64_t value) = 0; + virtual void setSize(uint64_t size) = 0; + }; + + class SymbolTable : public virtual Section { + public: + virtual Symbol* addSymbol(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other = 0) = 0; + virtual size_t symbolCount() = 0; + virtual Symbol* symbol(size_t i) = 0; + }; + + class NoteSection : public virtual Section { + public: + virtual bool addNote(const std::string& name, uint32_t type, const void* desc = 0, uint32_t desc_size = 0) = 0; + virtual bool getNote(const std::string& name, uint32_t type, void** desc, uint32_t* desc_size) = 0; + }; + + class Image { + public: + virtual ~Image() { } + + virtual bool initNew(uint16_t machine, uint16_t type, uint8_t os_abi = 0, uint8_t abi_version = 0, uint32_t e_flags = 0) = 0; + virtual bool loadFromFile(const std::string& filename) = 0; + virtual bool saveToFile(const std::string& filename) = 0; + virtual bool initFromBuffer(const void* buffer, size_t size) =
0; + virtual bool initAsBuffer(const void* buffer, size_t size) = 0; + virtual bool writeTo(const std::string& filename) = 0; + virtual bool copyToBuffer(void** buf, size_t* size = 0) = 0; // Copy to a new buffer allocated with malloc. + virtual bool copyToBuffer(void* buf, size_t size) = 0; // Copy to an existing buffer of the given size. + + virtual const char* data() = 0; + virtual uint64_t size() = 0; + + virtual uint16_t Machine() = 0; + virtual uint16_t Type() = 0; + + std::string output() { return out.str(); } // Returns the text accumulated in the 'out' stream. + + virtual bool Freeze() = 0; + virtual bool Validate() = 0; + + virtual StringTable* shstrtab() = 0; + virtual StringTable* strtab() = 0; + virtual SymbolTable* symtab() = 0; + virtual SymbolTable* getSymtab(uint16_t index) = 0; + + virtual StringTable* addStringTable(const std::string& name) = 0; + virtual StringTable* getStringTable(uint16_t index) = 0; + + virtual SymbolTable* addSymbolTable(const std::string& name, StringTable* stab = 0) = 0; + + virtual size_t segmentCount() = 0; + virtual Segment* segment(size_t i) = 0; + virtual Segment* segmentByVAddr(uint64_t vaddr) = 0; + + virtual size_t sectionCount() = 0; + virtual Section* section(size_t i) = 0; + virtual Section* sectionByVAddr(uint64_t vaddr) = 0; + + virtual NoteSection* note() = 0; + virtual NoteSection* addNoteSection(const std::string& name) = 0; + + virtual Segment* initSegment(uint32_t type, uint32_t flags, uint64_t paddr = 0) = 0; + virtual bool addSegments() = 0; + + virtual Section* addSection(const std::string &name, + uint32_t type, + uint64_t flags = 0, + uint64_t entsize = 0, + Segment* segment = 0) = 0; + + virtual RelocationSection* relocationSection(Section* sec, SymbolTable* symtab = 0) = 0; + + protected: + std::ostringstream out; + }; + + Image* NewElf32Image(); // NOTE(review): ownership of the returned Image is not stated here - confirm who deletes it. + Image* NewElf64Image(); + + uint64_t ElfSize(const void* buffer); + + std::string GetNoteString(uint32_t s_size, const char* s); + + } // namespace elf +} // namespace amd + +#endif // AMD_ELF_IMAGE_HPP_ diff --git
a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h new file mode 100644 index 0000000000..446e556f21 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -0,0 +1,354 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// AMD specific HSA backend. + +#ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ +#define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ + +#include + +#include "hsakmt.h" + +#include "core/inc/runtime.h" +#include "core/inc/agent.h" +#include "core/inc/blit.h" +#include "core/inc/signal.h" +#include "core/util/small_heap.h" +#include "core/util/locks.h" + +namespace amd { +// @brief Contains scratch memory information. +struct ScratchInfo { + void* queue_base; + size_t size; + size_t size_per_thread; + ptrdiff_t queue_process_offset; +}; + +// @brief Interface to represent a GPU agent. +class GpuAgentInt : public core::Agent { + public: + // @brief Constructor + GpuAgentInt(uint32_t node_id) + : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {} + + // @brief Invoke the user provided callback for each region accessible by + // this agent. + // + // @param [in] include_peer If true, the callback will be also invoked on each + // peer memory region accessible by this agent. If false, only invoke the + // callback on memory region owned by this agent. + // @param [in] callback User provided callback function. + // @param [in] data User provided pointer as input for @p callback. + // + // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed + // region returns ::HSA_STATUS_SUCCESS. + virtual hsa_status_t VisitRegion(bool include_peer, + hsa_status_t (*callback)(hsa_region_t region, + void* data), + void* data) const = 0; + + // @brief Carve scratch memory from scratch pool. + // + // @param [out] scratch Structure to be populated with the carved memory + // information. 
+ virtual void AcquireQueueScratch(ScratchInfo& scratch) = 0; + + // @brief Release scratch memory back to scratch pool. + // + // @param [in] base Address of scratch memory previously acquired with + // call to ::AcquireQueueScratch. + virtual void ReleaseQueueScratch(void* base) = 0; + + // @brief Translate the kernel start and end dispatch timestamp from agent + // domain to host domain. + // + // @param [in] signal Pointer to signal that provides the dispatch timing. + // @param [out] time Structure to be populated with the host domain value. + virtual void TranslateTime(core::Signal* signal, + hsa_amd_profiling_dispatch_time_t& time) = 0; + + // @brief Translate a timestamp from agent domain to host domain. + // + // @param [in] tick Timestamp in agent domain. + virtual uint64_t TranslateTime(uint64_t tick) = 0; + + // @brief Sets the coherency type of this agent. + // + // @param [in] type New coherency type. + // + // @retval true The new coherency type is set successfully. + virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0; + + // @brief Returns the current coherency type of this agent. + // + // @retval Coherency type. + virtual hsa_amd_coherency_type_t current_coherency_type() const = 0; + + // @brief Query if agent represents a Kaveri GPU. + // + // @retval true if agent is Kaveri GPU. + virtual bool is_kv_device() const = 0; + + // @brief Query the agent HSA profile. + // + // @retval HSA profile. + virtual hsa_profile_t profile() const = 0; +}; + +class GpuAgent : public GpuAgentInt { + public: + // @brief GPU agent constructor. + // + // @param [in] node Node id. Each CPU in different socket will get distinct + // id. + // @param [in] node_props Node property. + GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props); + + // @brief GPU agent destructor. + ~GpuAgent(); + + // @brief Initialize DMA queue. + // + // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful. 
+ hsa_status_t InitDma(); + + uint16_t GetMicrocodeVersion() const; + + // @brief Assembles SP3 shader source into executable code. + // + // @param [in] src_sp3 SP3 shader source text representation. + // @param [in] func_name Name of the SP3 function to assemble. + // @param [out] code_buf Executable code buffer. + // @param [out] code_buf_size Size of executable code buffer in bytes. + void AssembleShader(const char* src_sp3, const char* func_name, + void*& code_buf, size_t& code_buf_size); + + // @brief Frees executable code created by AssembleShader. + // + // @param [in] code_buf Executable code buffer. + // @param [in] code_buf_size Size of executable code buffer in bytes. + void ReleaseShader(void* code_buf, size_t code_buf_size); + + // @brief Override from amd::GpuAgentInt. + hsa_status_t VisitRegion(bool include_peer, + hsa_status_t (*callback)(hsa_region_t region, + void* data), + void* data) const override; + + // @brief Override from core::Agent. + hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region, + void* data), + void* data) const override; + + // @brief Override from core::Agent. + hsa_status_t DmaCopy(void* dst, const void* src, size_t size) override; + + // @brief Override from core::Agent. + hsa_status_t DmaCopy(void* dst, const void* src, size_t size, + std::vector& dep_signals, + core::Signal& out_signal) override; + + // @brief Override from core::Agent. + hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override; + + // @brief Override from core::Agent. + hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override; + + // @brief Override from core::Agent. + hsa_status_t QueueCreate(size_t size, hsa_queue_type_t queue_type, + core::HsaEventCallback event_callback, void* data, + uint32_t private_segment_size, + uint32_t group_segment_size, + core::Queue** queue) override; + + // @brief Override from amd::GpuAgentInt. 
+ void AcquireQueueScratch(ScratchInfo& scratch) override; + + // @brief Override from amd::GpuAgentInt. + void ReleaseQueueScratch(void* base) override; + + // @brief Override from amd::GpuAgentInt. + void TranslateTime(core::Signal* signal, + hsa_amd_profiling_dispatch_time_t& time) override; + + // @brief Override from amd::GpuAgentInt. + uint64_t TranslateTime(uint64_t tick) override; + + // @brief Override from amd::GpuAgentInt. + bool current_coherency_type(hsa_amd_coherency_type_t type) override; + + // @brief Override from amd::GpuAgentInt. + hsa_amd_coherency_type_t current_coherency_type() const override { + return current_coherency_type_; + } + + // Getters & setters. + + // @brief Returns node property. + __forceinline const HsaNodeProperties& properties() const { + return properties_; + } + + // @brief Returns number of data caches. + __forceinline size_t num_cache() const { return cache_props_.size(); } + + // @brief Returns data cache property. + // + // @param [in] idx Cache level. + __forceinline const HsaCacheProperties& cache_prop(int idx) const { + return cache_props_[idx]; + } + + // @brief Override from core::Agent. + const std::vector& regions() const override { + return regions_; + } + + // @brief Override from core::Agent. + const core::Isa* isa() const override { return isa_; } + + // @brief Override from amd::GpuAgentInt. + __forceinline bool is_kv_device() const override { return is_kv_device_; } + + // @brief Override from amd::GpuAgentInt. + __forceinline hsa_profile_t profile() const override { return profile_; } + + protected: + static const uint32_t minAqlSize_ = 0x1000; // 4KB min + static const uint32_t maxAqlSize_ = 0x20000; // 8MB max -- NOTE(review): 0x20000 is 32x minAqlSize_, i.e. 128K if 0x1000 is 4K; confirm unit + + // @brief Invoke the user provided callback for every region in @p regions. + // + // @param [in] regions Array of region objects. + // @param [in] callback User provided callback function. + // @param [in] data User provided pointer as input for @p callback. 
+ // + // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed + // region returns ::HSA_STATUS_SUCCESS. + hsa_status_t VisitRegion( + const std::vector& regions, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const; + + // @brief Update ::t1_ tick count. + void SyncClocks(); + + // @brief Binds the second-level trap handler to this node. + void BindTrapHandler(); + + // @brief Node properties. + const HsaNodeProperties properties_; + + // @brief Current coherency type. + hsa_amd_coherency_type_t current_coherency_type_; + + // @brief Maximum number of queues that can be created. + uint32_t max_queues_; + + // @brief Object to manage scratch memory. + SmallHeap scratch_pool_; + + // @brief Default scratch size per queue. + size_t queue_scratch_len_; + + // @brief Default scratch size per work item. + size_t scratch_per_thread_; + + // @brief Blit object to handle memory copy/fill. + core::Blit* blit_; + + // @brief Mutex to protect the update to coherency type. + KernelMutex coherency_lock_; + + // @brief Mutex to protect access to scratch pool. + KernelMutex scratch_lock_; + + // @brief Mutex to protect access to ::t1_. + KernelMutex t1_lock_; + + // @brief GPU tick on initialization. + HsaClockCounters t0_; + + HsaClockCounters t1_; + + // @brief Array of GPU cache property. + std::vector cache_props_; + + // @brief Array of regions owned by this agent. + std::vector regions_; + + core::Isa* isa_; + + // @brief HSA profile. + hsa_profile_t profile_; + + bool is_kv_device_; + + void* trap_code_buf_; + + size_t trap_code_buf_size_; + + private: + // @brief Query the driver to get the region list owned by this agent. + void InitRegionList(); + + // @brief Reserve memory for scratch pool to be used by AQL queue of this + // agent. + void InitScratchPool(); + + // @brief Query the driver to get the cache properties. + void InitCacheList(); + + // @brief Alternative aperture base address. Only on KV. 
+ uintptr_t ape1_base_; + + // @brief Alternative aperture size. Only on KV. + size_t ape1_size_; + + DISALLOW_COPY_AND_ASSIGN(GpuAgent); +}; + +} // namespace + +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp b/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp new file mode 100644 index 0000000000..8431b5963c --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_hsa_code.hpp @@ -0,0 +1,387 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_CODE_HPP_ +#define AMD_HSA_CODE_HPP_ + +#include "amd_elf_image.hpp" +#include "amd_hsa_elf.h" +#include "amd_hsa_kernel_code.h" +#include "hsa.h" +#include "hsa_ext_finalize.h" +#include +#include +#include +#include + +namespace amd { +namespace hsa { +namespace common { + +template +class Signed { +public: + static const uint64_t CT_SIGNATURE; + const uint64_t RT_SIGNATURE; + +protected: + Signed(): RT_SIGNATURE(signature) {} + virtual ~Signed() {} +}; + +template +const uint64_t Signed::CT_SIGNATURE = signature; + +bool IsAccessibleMemoryAddress(uint64_t address); + +template +size_t OffsetOf(member_type class_type::*member) +{ + return (char*)&((class_type*)nullptr->*member) - (char*)nullptr; +} + +template +class_type* ObjectAt(uint64_t address) +{ + if (!IsAccessibleMemoryAddress(address)) { + return nullptr; + } + + const uint64_t *rt_signature = + (const uint64_t*)(address + OffsetOf(&class_type::RT_SIGNATURE)); + if (nullptr == rt_signature) { + return nullptr; + } + if (class_type::CT_SIGNATURE != *rt_signature) { + return nullptr; + } + + return (class_type*)address; +} + +} + +namespace code { + + typedef amd::elf::Segment Segment; + typedef amd::elf::Section Section; + typedef amd::elf::RelocationSection RelocationSection; + typedef amd::elf::Relocation Relocation; + + class KernelSymbol; + class VariableSymbol; + + class Symbol { + protected: + 
amd::elf::Symbol* elfsym; + + public: + explicit Symbol(amd::elf::Symbol* elfsym_) + : elfsym(elfsym_) { } + virtual ~Symbol() { } + virtual bool IsKernelSymbol() const { return false; } + virtual KernelSymbol* AsKernelSymbol() { assert(false); return 0; } + virtual bool IsVariableSymbol() const { return false; } + virtual VariableSymbol* AsVariableSymbol() { assert(false); return 0; } + amd::elf::Symbol* elfSym() { return elfsym; } + std::string Name() const { return elfsym ? elfsym->name() : ""; } + Section* GetSection() { return elfsym->section(); } + virtual uint64_t SectionOffset() const { return elfsym->value(); } + virtual uint64_t VAddr() const { return elfsym->section()->addr() + elfsym->value(); } + uint32_t Index() const { return elfsym ? elfsym->index() : 0; } + bool IsDeclaration() const; + bool IsDefinition() const; + virtual bool IsAgent() const; + virtual hsa_symbol_kind_t Kind() const = 0; + hsa_symbol_linkage_t Linkage() const; + hsa_variable_allocation_t Allocation() const; + hsa_variable_segment_t Segment() const; + uint64_t Size() const; + uint32_t Size32() const; + uint32_t Alignment() const; + bool IsConst() const; + virtual hsa_status_t GetInfo(hsa_code_symbol_info_t attribute, void *value); + static hsa_code_symbol_t ToHandle(Symbol* sym); + static Symbol* FromHandle(hsa_code_symbol_t handle); + void setValue(uint64_t value) { elfsym->setValue(value); } + void setSize(uint32_t size) { elfsym->setSize(size); } + }; + + class KernelSymbol : public Symbol { + private: + uint32_t kernarg_segment_size, kernarg_segment_alignment; + uint32_t group_segment_size, private_segment_size; + bool is_dynamic_callstack; + + public: + explicit KernelSymbol(amd::elf::Symbol* elfsym_, const amd_kernel_code_t* akc); + bool IsKernelSymbol() const override { return true; } + KernelSymbol* AsKernelSymbol() override { return this; } + hsa_symbol_kind_t Kind() const override { return HSA_SYMBOL_KIND_KERNEL; } + hsa_status_t GetInfo(hsa_code_symbol_info_t attribute, 
void *value) override; + }; + + class VariableSymbol : public Symbol { + public: + explicit VariableSymbol(amd::elf::Symbol* elfsym_) + : Symbol(elfsym_) { } + bool IsVariableSymbol() const override { return true; } + VariableSymbol* AsVariableSymbol() override { return this; } + hsa_symbol_kind_t Kind() const override { return HSA_SYMBOL_KIND_VARIABLE; } + hsa_status_t GetInfo(hsa_code_symbol_info_t attribute, void *value) override; + }; + + class AmdHsaCode { + private: + std::ostringstream out; + std::unique_ptr img; + std::vector dataSegments; + std::vector dataSections; + std::vector relocationSections; + std::vector symbols; + bool combineDataSegments; + Segment* hsaSegments[AMDGPU_HSA_SEGMENT_LAST][2]; + Section* hsaSections[AMDGPU_HSA_SECTION_LAST]; + + amd::elf::Section* hsatext; + amd::elf::Section* imageInit; + amd::elf::Section* samplerInit; + amd::elf::Section* debugInfo; + amd::elf::Section* debugLine; + amd::elf::Section* debugAbbrev; + + bool PullElf(); + bool PullElfV1(); + bool PullElfV2(); + + void AddAmdNote(uint32_t type, const void* desc, uint32_t desc_size); + template + bool GetAmdNote(uint32_t type, S** desc) + { + uint32_t desc_size; + if (!img->note()->getNote("AMD", type, (void**) desc, &desc_size)) { + out << "Failed to find note, type: " << type << std::endl; + return false; + } + if (desc_size < sizeof(S)) { + out << "Note size mismatch, type: " << type << " size: " << desc_size << " expected at least " << sizeof(S) << std::endl; + return false; + } + return true; + } + + void PrintSegment(std::ostream& out, Segment* segment); + void PrintSection(std::ostream& out, Section* section); + void PrintRawData(std::ostream& out, Section* section); + void PrintRawData(std::ostream& out, const unsigned char *data, size_t size); + void PrintRelocationData(std::ostream& out, RelocationSection* section); + void PrintSymbol(std::ostream& out, Symbol* sym); + void PrintDisassembly(std::ostream& out, const unsigned char *isa, size_t size, uint32_t 
isa_offset = 0); + std::string MangleSymbolName(const std::string& module_name, const std::string symbol_name); + bool ElfImageError(); + + public: + bool HasHsaText() const { return hsatext != 0; } + amd::elf::Section* HsaText() { assert(hsatext); return hsatext; } + const amd::elf::Section* HsaText() const { assert(hsatext); return hsatext; } + amd::elf::SymbolTable* Symtab() { assert(img); return img->symtab(); } + uint16_t Machine() { return img->Machine(); } + + AmdHsaCode(bool combineDataSegments = true); + virtual ~AmdHsaCode(); + + std::string output() { return out.str(); } + bool LoadFromFile(const std::string& filename); + bool SaveToFile(const std::string& filename); + bool WriteToBuffer(void* buffer); + bool InitFromBuffer(const void* buffer, size_t size); + bool InitAsBuffer(const void* buffer, size_t size); + bool InitAsHandle(hsa_code_object_t code_handle); + bool InitNew(bool xnack = false); + bool Freeze(); + hsa_code_object_t GetHandle(); + const char* ElfData(); + uint64_t ElfSize(); + bool Validate(); + void Print(std::ostream& out); + void PrintNotes(std::ostream& out); + void PrintSegments(std::ostream& out); + void PrintSections(std::ostream& out); + void PrintSymbols(std::ostream& out); + void PrintMachineCode(std::ostream& out); + void PrintMachineCode(std::ostream& out, KernelSymbol* sym); + bool PrintToFile(const std::string& filename); + + void AddNoteCodeObjectVersion(uint32_t major, uint32_t minor); + bool GetNoteCodeObjectVersion(uint32_t* major, uint32_t* minor); + bool GetNoteCodeObjectVersion(std::string& version); + void AddNoteHsail(uint32_t hsail_major, uint32_t hsail_minor, hsa_profile_t profile, hsa_machine_model_t machine_model, hsa_default_float_rounding_mode_t rounding_mode); + bool GetNoteHsail(uint32_t* hsail_major, uint32_t* hsail_minor, hsa_profile_t* profile, hsa_machine_model_t* machine_model, hsa_default_float_rounding_mode_t* default_float_round); + void AddNoteIsa(const std::string& vendor_name, const std::string& 
architecture_name, uint32_t major, uint32_t minor, uint32_t stepping); + bool GetNoteIsa(std::string& vendor_name, std::string& architecture_name, uint32_t* major_version, uint32_t* minor_version, uint32_t* stepping); + bool GetNoteIsa(std::string& isaName); + void AddNoteProducer(uint32_t major, uint32_t minor, const std::string& producer); + bool GetNoteProducer(uint32_t* major, uint32_t* minor, std::string& producer_name); + void AddNoteProducerOptions(const std::string& options); + void AddNoteProducerOptions(int32_t call_convention, const hsa_ext_control_directives_t& user_directives, const std::string& user_options); + bool GetNoteProducerOptions(std::string& options); + + hsa_status_t GetInfo(hsa_code_object_info_t attribute, void *value); + hsa_status_t GetSymbol(const char *module_name, const char *symbol_name, hsa_code_symbol_t *sym); + hsa_status_t IterateSymbols(hsa_code_object_t code_object, + hsa_status_t (*callback)( + hsa_code_object_t code_object, + hsa_code_symbol_t symbol, + void* data), + void* data); + + void AddHsaTextData(const void* buffer, size_t size); + uint64_t NextKernelCodeOffset() const; + bool AddKernelCode(KernelSymbol* sym, const void* code, size_t size); + + Symbol* AddKernelDefinition(const std::string& name, const void* isa, size_t isa_size); + + size_t DataSegmentCount() { return dataSegments.size(); } + Segment* DataSegment(size_t i) { return dataSegments[i]; } + + size_t DataSectionCount() { return dataSections.size(); } + Section* DataSection(size_t i) { return dataSections[i]; } + + Section* AddEmptySection(); + Section* AddCodeSection(Segment* segment); + Section* AddDataSection(const std::string &name, + uint32_t type, + uint64_t flags, + Segment* segment); + + bool HasImageInitSection() const { return imageInit != 0; } + Section* ImageInitSection(); + void AddImageInitializer(Symbol* image, uint64_t destOffset, const amdgpu_hsa_image_descriptor_t& init); + void AddImageInitializer(Symbol* image, uint64_t destOffset, + 
amdgpu_hsa_metadata_kind16_t kind, + amdgpu_hsa_image_geometry8_t geometry, + amdgpu_hsa_image_channel_order8_t channel_order, amdgpu_hsa_image_channel_type8_t channel_type, + uint64_t width, uint64_t height, uint64_t depth, uint64_t array); + + + bool HasSamplerInitSection() const { return samplerInit != 0; } + amd::elf::Section* SamplerInitSection(); + amd::elf::Section* AddSamplerInit(); + void AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, const amdgpu_hsa_sampler_descriptor_t& init); + void AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, + amdgpu_hsa_sampler_coord8_t coord, + amdgpu_hsa_sampler_filter8_t filter, + amdgpu_hsa_sampler_addressing8_t addressing); + + void AddInitVarWithAddress(bool large, Symbol* dest, uint64_t destOffset, Symbol* addrOf, uint64_t addrAddend); + + void InitHsaSegment(amdgpu_hsa_elf_segment_t segment, bool writable); + bool AddHsaSegments(); + Segment* HsaSegment(amdgpu_hsa_elf_segment_t segment, bool writable); + + void InitHsaSectionSegment(amdgpu_hsa_elf_section_t section, bool combineSegments = true); + Section* HsaDataSection(amdgpu_hsa_elf_section_t section, bool combineSegments = true); + + Symbol* AddExecutableSymbol(const std::string &name, + unsigned char type, + unsigned char binding, + unsigned char other, + Section *section = 0); + + Symbol* AddVariableSymbol(const std::string &name, + unsigned char type, + unsigned char binding, + unsigned char other, + Section *section, + uint64_t value, + uint64_t size); + void AddSectionSymbols(); + + size_t RelocationSectionCount() { return relocationSections.size(); } + RelocationSection* GetRelocationSection(size_t i) { return relocationSections[i]; } + + size_t SymbolCount() { return symbols.size(); } + Symbol* GetSymbol(size_t i) { return symbols[i]; } + Symbol* GetSymbolByElfIndex(size_t index); + Symbol* FindSymbol(const std::string &n); + + void AddData(amdgpu_hsa_elf_section_t section, const void* data = 0, size_t size = 0); + + Section* 
DebugInfo(); + Section* DebugLine(); + Section* DebugAbbrev(); + + Section* AddHsaHlDebug(const std::string& name, const void* data, size_t size); + }; + + class AmdHsaCodeManager { + private: + typedef std::unordered_map CodeMap; + CodeMap codeMap; + + public: + AmdHsaCode* FromHandle(hsa_code_object_t handle); + bool Destroy(hsa_code_object_t handle); + }; + + class KernelSymbolV2 : public KernelSymbol { + private: + public: + explicit KernelSymbolV2(amd::elf::Symbol* elfsym_, const amd_kernel_code_t* akc); + bool IsAgent() const override { return true; } + uint64_t SectionOffset() const override { return elfsym->value() - elfsym->section()->addr(); } + uint64_t VAddr() const override { return elfsym->value(); } + }; + + class VariableSymbolV2 : public VariableSymbol { + private: + public: + explicit VariableSymbolV2(amd::elf::Symbol* elfsym_) : VariableSymbol(elfsym_) { } + bool IsAgent() const override { return false; } + uint64_t SectionOffset() const override { return elfsym->value() - elfsym->section()->addr(); } + uint64_t VAddr() const override { return elfsym->value(); } + }; +} +} +} + +#endif // AMD_HSA_CODE_HPP_ diff --git a/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp b/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp new file mode 100644 index 0000000000..8e29df1496 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp @@ -0,0 +1,358 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_LOADER_HPP +#define AMD_HSA_LOADER_HPP + +#include +#include +#include "hsa.h" +#include "hsa_ext_image.h" +#include "amd_hsa_elf.h" +#include "amd_load_map.h" +#include +#include +#include + +/// @brief Major version of the AMD HSA Loader. Major versions are not backwards +/// compatible. 
+#define AMD_HSA_LOADER_VERSION_MAJOR 0 + +/// @brief Minor version of the AMD HSA Loader. Minor versions are backwards +/// compatible. +#define AMD_HSA_LOADER_VERSION_MINOR 5 + +/// @brief Descriptive version of the AMD HSA Loader. +#define AMD_HSA_LOADER_VERSION "AMD HSA Loader v0.05 (June 16, 2015)" + +enum hsa_ext_symbol_info_t { + HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE = 100, + HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN = 101, +}; + +typedef uint32_t hsa_symbol_info32_t; +typedef hsa_executable_symbol_t hsa_symbol_t; +typedef hsa_executable_symbol_info_t hsa_symbol_info_t; + +namespace amd { +namespace hsa { +namespace loader { + +//===----------------------------------------------------------------------===// +// Context. // +//===----------------------------------------------------------------------===// + +class Context { +public: + virtual ~Context() {} + + virtual hsa_isa_t IsaFromName(const char *name) = 0; + + virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) = 0; + + virtual void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) = 0; + + virtual bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) = 0; + + virtual void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) = 0; + + virtual void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) = 0; + + virtual void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) = 0; + + virtual bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) = 0; + + virtual bool ImageExtensionSupported() = 0; + + virtual hsa_status_t ImageCreate( + hsa_agent_t agent, + hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + 
hsa_ext_image_t *image_handle) = 0; + + virtual hsa_status_t ImageDestroy( + hsa_agent_t agent, hsa_ext_image_t image_handle) = 0; + + virtual hsa_status_t SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle) = 0; + + virtual hsa_status_t SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) = 0; + +protected: + Context() {} + +private: + Context(const Context &c); + Context& operator=(const Context &c); +}; + +//===----------------------------------------------------------------------===// +// Symbol. // +//===----------------------------------------------------------------------===// + +class Symbol { +public: + static hsa_symbol_t Handle(Symbol *symbol) { + hsa_symbol_t symbol_handle = + {reinterpret_cast(symbol)}; + return symbol_handle; + } + + static Symbol* Object(hsa_symbol_t symbol_handle) { + Symbol *symbol = + reinterpret_cast(symbol_handle.handle); + return symbol; + } + + virtual ~Symbol() {} + + virtual bool GetInfo(hsa_symbol_info32_t symbol_info, void *value) = 0; + +protected: + Symbol() {} + +private: + Symbol(const Symbol &s); + Symbol& operator=(const Symbol &s); +}; + +//===----------------------------------------------------------------------===// +// LoadedCodeObject. 
// +//===----------------------------------------------------------------------===// + +class LoadedCodeObject { +public: + static amd_loaded_code_object_t Handle(LoadedCodeObject *object) { + amd_loaded_code_object_t handle = + {reinterpret_cast(object)}; + return handle; + } + + static LoadedCodeObject* Object(amd_loaded_code_object_t handle) { + LoadedCodeObject *object = + reinterpret_cast(handle.handle); + return object; + } + + virtual ~LoadedCodeObject() {} + + virtual bool GetInfo(amd_loaded_code_object_info_t attribute, void *value) = 0; + + virtual hsa_status_t IterateLoadedSegments( + hsa_status_t (*callback)( + amd_loaded_segment_t loaded_segment, + void *data), + void *data) = 0; + +protected: + LoadedCodeObject() {} + +private: + LoadedCodeObject(const LoadedCodeObject&); + LoadedCodeObject& operator=(const LoadedCodeObject&); +}; + +//===----------------------------------------------------------------------===// +// LoadedSegment. // +//===----------------------------------------------------------------------===// + +class LoadedSegment { +public: + static amd_loaded_segment_t Handle(LoadedSegment *object) { + amd_loaded_segment_t handle = + {reinterpret_cast(object)}; + return handle; + } + + static LoadedSegment* Object(amd_loaded_segment_t handle) { + LoadedSegment *object = + reinterpret_cast(handle.handle); + return object; + } + + virtual ~LoadedSegment() {} + + virtual bool GetInfo(amd_loaded_segment_info_t attribute, void *value) = 0; + +protected: + LoadedSegment() {} + +private: + LoadedSegment(const LoadedSegment&); + LoadedSegment& operator=(const LoadedSegment&); +}; + +//===----------------------------------------------------------------------===// +// Executable. 
// +//===----------------------------------------------------------------------===// + +class Executable { +public: + static hsa_executable_t Handle(Executable *executable) { + hsa_executable_t executable_handle = + {reinterpret_cast(executable)}; + return executable_handle; + } + + static Executable* Object(hsa_executable_t executable_handle) { + Executable *executable = + reinterpret_cast(executable_handle.handle); + return executable; + } + + virtual ~Executable() {} + + virtual hsa_status_t GetInfo( + hsa_executable_info_t executable_info, void *value) = 0; + + virtual hsa_status_t DefineProgramExternalVariable( + const char *name, void *address) = 0; + + virtual hsa_status_t DefineAgentExternalVariable( + const char *name, + hsa_agent_t agent, + hsa_variable_segment_t segment, + void *address) = 0; + + virtual hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + amd_loaded_code_object_t *loaded_code_object = nullptr) = 0; + + virtual hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + size_t code_object_size, + const char *options, + amd_loaded_code_object_t *loaded_code_object = nullptr) = 0; + + virtual hsa_status_t Freeze(const char *options) = 0; + + virtual hsa_status_t Validate(uint32_t *result) = 0; + + virtual Symbol* GetSymbol( + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention) = 0; + + typedef hsa_status_t (*iterate_symbols_f)( + hsa_executable_t executable, + hsa_symbol_t symbol_handle, + void *data); + + virtual hsa_status_t IterateSymbols( + iterate_symbols_f callback, void *data) = 0; + + virtual hsa_status_t IterateLoadedCodeObjects( + hsa_status_t (*callback)( + amd_loaded_code_object_t loaded_code_object, + void *data), + void *data) = 0; + +protected: + Executable() {} + +private: + Executable(const Executable &e); + Executable& operator=(const Executable &e); + + static std::vector executables; + static 
std::mutex executables_mutex; +}; + +/// @class Loader +class Loader { +public: + /// @brief Destructor. + virtual ~Loader() {} + + /// @brief Creates AMD HSA Loader with specified @p context. + /// + /// @param[in] context Context. Must not be null. + /// + /// @returns AMD HSA Loader on success, null on failure. + static Loader* Create(Context* context); + + /// @brief Destroys AMD HSA Loader @p loader. + /// + /// @param[in] loader AMD HSA Loader to destroy. Must not be null. + static void Destroy(Loader *loader); + + /// @returns Context associated with Loader. + virtual Context* GetContext() const = 0; + + /// @brief Creates empty AMD HSA Executable with specified @p profile and + /// @p options. + virtual Executable* CreateExecutable(hsa_profile_t profile, const char *options) = 0; + + virtual void DestroyExecutable(Executable *executable) = 0; + + virtual hsa_status_t IterateExecutables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data) = 0; + +protected: + /// @brief Default constructor. + Loader() {} + +private: + /// @brief Copy constructor - not available. + Loader(const Loader&); + + /// @brief Assignment operator - not available. + Loader& operator=(const Loader&); +}; + + +} // namespace loader +} // namespace hsa +} // namespace amd + +#endif // AMD_HSA_LOADER_HPP diff --git a/runtime/hsa-runtime/core/inc/amd_load_map.h b/runtime/hsa-runtime/core/inc/amd_load_map.h new file mode 100644 index 0000000000..bd3f78c82d --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_load_map.h @@ -0,0 +1,174 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_LOAD_MAP_H +#define AMD_LOAD_MAP_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// @todo. +enum { + AMD_EXTENSION_LOAD_MAP = 0x1002 +}; + +/// @todo. +typedef struct amd_loaded_code_object_s { + uint64_t handle; +} amd_loaded_code_object_t; + +/// @todo. 
+typedef enum amd_loaded_code_object_info_t { + AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE = 0, + AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE = 1 +} amd_loaded_code_object_info_t; + +/// @brief Handle identifying a loaded segment. +typedef struct amd_loaded_segment_s { + uint64_t handle; +} amd_loaded_segment_t; + +/// @brief Attributes accepted by amd_loaded_segment_get_info(). Declared +/// through a typedef so the name is a valid type in C as well as in C++. +typedef enum amd_loaded_segment_info_t { + AMD_LOADED_SEGMENT_INFO_TYPE = 0, + AMD_LOADED_SEGMENT_INFO_ELF_BASE_ADDRESS = 1, + AMD_LOADED_SEGMENT_INFO_LOAD_BASE_ADDRESS = 2, + AMD_LOADED_SEGMENT_INFO_SIZE = 3 +} amd_loaded_segment_info_t; + +/// @todo. +hsa_status_t amd_executable_load_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + amd_loaded_code_object_t *loaded_code_object); + +/// @brief Invokes @p callback for each available executable in current +/// process. +hsa_status_t amd_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); + +/// @brief Invokes @p callback for each loaded code object in specified +/// @p executable. +hsa_status_t amd_executable_iterate_loaded_code_objects( + hsa_executable_t executable, + hsa_status_t (*callback)( + amd_loaded_code_object_t loaded_code_object, + void *data), + void *data); + +/// @brief Retrieves current value of specified @p loaded_code_object's +/// @p attribute. +hsa_status_t amd_loaded_code_object_get_info( + amd_loaded_code_object_t loaded_code_object, + amd_loaded_code_object_info_t attribute, + void *value); + +/// @brief Invokes @p callback for each loaded segment in specified +/// @p loaded_code_object. +hsa_status_t amd_loaded_code_object_iterate_loaded_segments( + amd_loaded_code_object_t loaded_code_object, + hsa_status_t (*callback)( + amd_loaded_segment_t loaded_segment, + void *data), + void *data); + +/// @brief Retrieves current value of specified @p loaded_segment's +/// @p attribute.
+hsa_status_t amd_loaded_segment_get_info( + amd_loaded_segment_t loaded_segment, + amd_loaded_segment_info_t attribute, + void *value); + +#define amd_load_map_1_00 + +typedef struct amd_load_map_1_00_pfn_s { + hsa_status_t (*amd_executable_load_code_object)( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + amd_loaded_code_object_t *loaded_code_object); + + hsa_status_t (*amd_iterate_executables)( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); + + hsa_status_t (*amd_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + amd_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*amd_loaded_code_object_get_info)( + amd_loaded_code_object_t loaded_code_object, + amd_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t (*amd_loaded_code_object_iterate_loaded_segments)( + amd_loaded_code_object_t loaded_code_object, + hsa_status_t (*callback)( + amd_loaded_segment_t loaded_segment, + void *data), + void *data); + + hsa_status_t (*amd_loaded_segment_get_info)( + amd_loaded_segment_t loaded_segment, + amd_loaded_segment_info_t attribute, + void *value); +} amd_load_map_1_00_pfn_t; + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // AMD_LOAD_MAP_H diff --git a/runtime/hsa-runtime/core/inc/amd_loader_context.hpp b/runtime/hsa-runtime/core/inc/amd_loader_context.hpp new file mode 100644 index 0000000000..19a6a5cfd9 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_loader_context.hpp @@ -0,0 +1,97 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_LOADER_CONTEXT_HPP +#define HSA_RUNTIME_CORE_INC_AMD_LOADER_CONTEXT_HPP + +#include "core/inc/amd_hsa_loader.hpp" + +namespace amd { + +class LoaderContext final: public hsa::loader::Context { +public: + LoaderContext(): hsa::loader::Context() {} + + ~LoaderContext() {} + + hsa_isa_t IsaFromName(const char *name) override; + + bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t code_object_isa) override; + + void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) override; + + bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) override; + + void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size = 0) override; + + void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; + + void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; + + bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) override; + + bool ImageExtensionSupported(); + + hsa_status_t ImageCreate( + hsa_agent_t agent, + hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_ext_image_t *image_handle); + + hsa_status_t ImageDestroy(hsa_agent_t agent, hsa_ext_image_t image_handle); + + hsa_status_t SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle); + + hsa_status_t SamplerDestroy(hsa_agent_t agent, hsa_ext_sampler_t sampler_handle); + +private: + LoaderContext(const LoaderContext&); + LoaderContext& operator=(const LoaderContext&); +}; + +} // namespace amd + +#endif // HSA_RUNTIME_CORE_INC_AMD_LOADER_CONTEXT_HPP diff 
--git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h new file mode 100644 index 0000000000..fb3a6531e4 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ -0,0 +1,191 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// AMD specific HSA backend. + +#ifndef HSA_RUNTIME_CORE_INC_AMD_MEMORY_REGION_H_ +#define HSA_RUNTIME_CORE_INC_AMD_MEMORY_REGION_H_ + +#include "hsakmt.h" + +#include "core/inc/agent.h" +#include "core/inc/memory_region.h" + +#include "inc/hsa_ext_amd.h" + +namespace amd { +class MemoryRegion : public core::MemoryRegion { + public: + /// @brief Convert this object into hsa_region_t. + static __forceinline hsa_region_t Convert(MemoryRegion* region) { + const hsa_region_t region_handle = { + static_cast(reinterpret_cast(region))}; + return region_handle; + } + + static __forceinline const hsa_region_t Convert(const MemoryRegion* region) { + const hsa_region_t region_handle = { + static_cast(reinterpret_cast(region))}; + return region_handle; + } + + /// @brief Convert hsa_region_t into amd::MemoryRegion *. + static __forceinline MemoryRegion* Convert(hsa_region_t region) { + return reinterpret_cast(region.handle); + } + + /// @brief Allocate agent accessible memory (system / local memory). + static void* AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, + size_t size); + + /// @brief Free agent accessible memory (system / local memory). + static void FreeKfdMemory(void* ptr, size_t size); + + static bool RegisterMemory(void* ptr, size_t size, size_t num_nodes, + const uint32_t* nodes); + + static void DeregisterMemory(void* ptr); + + /// @brief Pin memory. + static bool MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes, + void* ptr, size_t size, + uint64_t* alternate_va, + HsaMemMapFlags map_flag); + + /// @brief Unpin memory. 
+ static void MakeKfdMemoryUnresident(void* ptr); + + MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner, + const HsaMemoryProperties& mem_props); + + ~MemoryRegion(); + + hsa_status_t Allocate(size_t size, void** address) const; + + hsa_status_t Allocate(bool restrict_access, size_t size, + void** address) const; + + hsa_status_t Free(void* address, size_t size) const; + + hsa_status_t GetInfo(hsa_region_info_t attribute, void* value) const; + + hsa_status_t GetPoolInfo(hsa_amd_memory_pool_info_t attribute, + void* value) const; + + hsa_status_t GetAgentPoolInfo(const core::Agent& agent, + hsa_amd_agent_memory_pool_info_t attribute, + void* value) const; + + hsa_status_t AllowAccess(uint32_t num_agents, const hsa_agent_t* agents, + const void* ptr, size_t size) const; + + hsa_status_t CanMigrate(const MemoryRegion& dst, bool& result) const; + + hsa_status_t Migrate(uint32_t flag, const void* ptr) const; + + hsa_status_t Lock(uint32_t num_agents, const hsa_agent_t* agents, + void* host_ptr, size_t size, void** agent_ptr) const; + + hsa_status_t Unlock(void* host_ptr) const; + + HSAuint64 GetBaseAddress() const { return mem_props_.VirtualBaseAddress; } + + HSAuint64 GetPhysicalSize() const { return mem_props_.SizeInBytes; } + + HSAuint64 GetVirtualSize() const { return virtual_size_; } + + hsa_status_t AssignAgent(void* ptr, size_t size, const core::Agent& agent, + hsa_access_permission_t access) const; + + __forceinline bool IsLocalMemory() const { + return ((mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) || + (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC)); + } + + __forceinline bool IsPublic() const { + return (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC); + } + + __forceinline bool IsSystem() const { + return mem_props_.HeapType == HSA_HEAPTYPE_SYSTEM; + } + + __forceinline bool IsLDS() const { + return mem_props_.HeapType == HSA_HEAPTYPE_GPU_LDS; + } + + __forceinline bool IsGDS() const { + return 
mem_props_.HeapType == HSA_HEAPTYPE_GPU_GDS; + } + + __forceinline bool IsScratch() const { + return mem_props_.HeapType == HSA_HEAPTYPE_GPU_SCRATCH; + } + + __forceinline bool IsSvm() const { + return mem_props_.HeapType == HSA_HEAPTYPE_DEVICE_SVM; + } + + __forceinline uint32_t BusWidth() const { + return static_cast(mem_props_.Width); + } + + __forceinline uint32_t MaxMemCloc() const { + return static_cast(mem_props_.MemoryClockMax); + } + + private: + const HsaMemoryProperties mem_props_; + + HsaMemFlags mem_flag_; + + HsaMemMapFlags map_flag_; + + size_t max_single_alloc_size_; + + HSAuint64 virtual_size_; + + static const size_t kPageSize_ = 4096; +}; +} // namespace + +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/amd_topology.h b/runtime/hsa-runtime/core/inc/amd_topology.h new file mode 100644 index 0000000000..8e62679d14 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/amd_topology.h @@ -0,0 +1,56 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_TOPOLOGY_H_ +#define HSA_RUNTIME_CORE_INC_AMD_TOPOLOGY_H_ + +namespace amd { +/// @brief Initializes the runtime. +/// Should not be called directly, must be called only from Runtime::Acquire() +bool Load(); + +/// @brief Shutdown/cleanup of runtime. +/// Should not be called directly, must be called only from Runtime::Release() +bool Unload(); +} // namespace + +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/blit.h b/runtime/hsa-runtime/core/inc/blit.h new file mode 100644 index 0000000000..b3c94a25cd --- /dev/null +++ b/runtime/hsa-runtime/core/inc/blit.h @@ -0,0 +1,108 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. 
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_BLIT_H_ +#define HSA_RUNTIME_CORE_INC_BLIT_H_ + +#include + +#include "core/inc/agent.h" + +namespace core { +class Blit { + public: + explicit Blit() {} + virtual ~Blit() {} + + /// @brief Initialize a blit object. + /// + /// @param agent Reference to the agent that will execute the blit commands. + /// + /// @return hsa_status_t + virtual hsa_status_t Initialize(const core::Agent& agent) = 0; + + /// @brief Marks the blit object as invalid and uncouples its link with + /// the underlying compute device's control block. Use of the blit object + /// once it has been released is illegal and any behavior is indeterminate. + /// + /// @note: The call will block until all commands have executed. + /// + /// @return hsa_status_t + virtual hsa_status_t Destroy() = 0; + + /// @brief Submit a linear copy command to the underlying compute device's + /// control block. The call is blocking until the command execution is + /// finished. + /// + /// @param dst Memory address of the copy destination. + /// @param src Memory address of the copy source. + /// @param size Size of the data to be copied. + virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, + size_t size) = 0; + + /// @brief Submit a linear copy command to the underlying compute device's + /// control block. The call is non blocking. The memory transfer will start + /// after all dependent signals are satisfied. After the transfer is + /// completed, the out signal will be decremented. + /// + /// @param dst Memory address of the copy destination. + /// @param src Memory address of the copy source. + /// @param size Size of the data to be copied. + /// @param dep_signals Array of dependent signals. + /// @param out_signal Output signal.
+ virtual hsa_status_t SubmitLinearCopyCommand( + void* dst, const void* src, size_t size, + std::vector& dep_signals, core::Signal& out_signal) = 0; + + /// @brief Submit a linear fill command to the underlying compute device's + /// control block. The call is blocking until the command execution is + /// finished. + /// + /// @param ptr Memory address of the fill destination. + /// @param value Value to be set. + /// @param num Number of uint32_t elements to be set to the value. + virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value, + size_t num) = 0; +}; +} // namespace core + +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/checked.h b/runtime/hsa-runtime/core/inc/checked.h new file mode 100644 index 0000000000..d0ad2ff6d0 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/checked.h @@ -0,0 +1,75 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTME_CORE_INC_CHECKED_H_ +#define HSA_RUNTME_CORE_INC_CHECKED_H_ + +#include "stdint.h" + +namespace core { + +/// @brief Base class for all classes whose validity can be checked using +/// IsValid() method. 
+template +class Checked { + public: + typedef Checked CheckedType; + + // Every constructor tags the object with its own address XOR'ed with code; + // a copied or moved-to object gets a fresh tag rather than the source's. + Checked() { object_ = uintptr_t(this) ^ uintptr_t(code); } + Checked(const Checked&) { object_ = uintptr_t(this) ^ uintptr_t(code); } + Checked(Checked&&) { object_ = uintptr_t(this) ^ uintptr_t(code); } + + // Clear the tag on destruction. object_ is an integer, so store integer + // zero rather than the NULL pointer macro. + virtual ~Checked() { object_ = 0; } + + // Assignment leaves the tag untouched: it depends only on this object's + // own address, never on the right-hand side. + const Checked& operator=(Checked&& rhs) { return *this; } + const Checked& operator=(const Checked& rhs) { return *this; } + + /// @brief Returns true when the stored tag still matches this object's + /// address XOR'ed with code. + bool IsValid() const { + return object_ == (uintptr_t(this) ^ uintptr_t(code)); + } + + private: + uintptr_t object_; +}; + +} // namespace core +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/default_signal.h b/runtime/hsa-runtime/core/inc/default_signal.h new file mode 100644 index 0000000000..f0f13eb06e --- /dev/null +++ b/runtime/hsa-runtime/core/inc/default_signal.h @@ -0,0 +1,174 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA runtime C++ interface file. + +#ifndef HSA_RUNTME_CORE_INC_DEFAULT_SIGNAL_H_ +#define HSA_RUNTME_CORE_INC_DEFAULT_SIGNAL_H_ + +#include "core/inc/runtime.h" +#include "core/inc/signal.h" +#include "core/util/utils.h" + +namespace core { + +/// @brief Simple pure memory based signal. +/// @brief See base class Signal. +class DefaultSignal : public Signal { + public: + /// @brief Determines if a Signal* can be safely converted to DefaultSignal* + /// via static_cast. + static __forceinline bool IsType(Signal* ptr) { + return ptr->IsType(&rtti_id_); + } + + /// @brief See base class Signal. + explicit DefaultSignal(hsa_signal_value_t initial_value); + + /// @brief See base class Signal. + ~DefaultSignal(); + + // Below are various methods corresponding to the APIs, which load/store the + // signal value or modify the existing signal value atomically and with + // specified memory ordering semantics.
+ + hsa_signal_value_t LoadRelaxed(); + + hsa_signal_value_t LoadAcquire(); + + void StoreRelaxed(hsa_signal_value_t value); + + void StoreRelease(hsa_signal_value_t value); + + hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint); + + hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint); + + void AndRelaxed(hsa_signal_value_t value); + + void AndAcquire(hsa_signal_value_t value); + + void AndRelease(hsa_signal_value_t value); + + void AndAcqRel(hsa_signal_value_t value); + + void OrRelaxed(hsa_signal_value_t value); + + void OrAcquire(hsa_signal_value_t value); + + void OrRelease(hsa_signal_value_t value); + + void OrAcqRel(hsa_signal_value_t value); + + void XorRelaxed(hsa_signal_value_t value); + + void XorAcquire(hsa_signal_value_t value); + + void XorRelease(hsa_signal_value_t value); + + void XorAcqRel(hsa_signal_value_t value); + + void AddRelaxed(hsa_signal_value_t value); + + void AddAcquire(hsa_signal_value_t value); + + void AddRelease(hsa_signal_value_t value); + + void AddAcqRel(hsa_signal_value_t value); + + void SubRelaxed(hsa_signal_value_t value); + + void SubAcquire(hsa_signal_value_t value); + + void SubRelease(hsa_signal_value_t value); + + void SubAcqRel(hsa_signal_value_t value); + + hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value); + + hsa_signal_value_t ExchAcquire(hsa_signal_value_t value); + + hsa_signal_value_t ExchRelease(hsa_signal_value_t value); + + hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value); + + hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, + hsa_signal_value_t value); + + hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, + hsa_signal_value_t value); + + hsa_signal_value_t CasRelease(hsa_signal_value_t expected, + hsa_signal_value_t value); + + hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, + 
hsa_signal_value_t value); + + /// @brief see the base class Signal + __forceinline hsa_signal_value_t* ValueLocation() const { + return (hsa_signal_value_t*)&signal_.value; + } + + /// @brief see the base class Signal + __forceinline HsaEvent* EopEvent() { return NULL; } + + /// @brief prevent throwing exceptions + void* operator new(size_t size) { return malloc(size); } + + /// @brief prevent throwing exceptions + void operator delete(void* ptr) { free(ptr); } + + protected: + bool _IsA(rtti_t id) const { return id == &rtti_id_; } + + private: + static int rtti_id_; + + DISALLOW_COPY_AND_ASSIGN(DefaultSignal); +}; + +} // namespace core +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/host_queue.h b/runtime/hsa-runtime/core/inc/host_queue.h new file mode 100644 index 0000000000..e3ad022f7e --- /dev/null +++ b/runtime/hsa-runtime/core/inc/host_queue.h @@ -0,0 +1,167 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_HOST_QUEUE_H_ +#define HSA_RUNTIME_CORE_INC_HOST_QUEUE_H_ + +#include "core/inc/memory_region.h" +#include "core/inc/queue.h" +#include "core/inc/runtime.h" +#include "core/inc/signal.h" + +namespace core { +class HostQueue : public Queue { + public: + HostQueue(hsa_region_t region, uint32_t ring_size, hsa_queue_type_t type, + uint32_t features, hsa_signal_t doorbell_signal); + + ~HostQueue(); + + hsa_status_t Inactivate() { return HSA_STATUS_SUCCESS; } + + uint64_t LoadReadIndexAcquire() { + return atomic::Load(&amd_queue_.read_dispatch_id, + std::memory_order_acquire); + } + + uint64_t LoadReadIndexRelaxed() { + return atomic::Load(&amd_queue_.read_dispatch_id, + std::memory_order_relaxed); + } + + uint64_t LoadWriteIndexAcquire() { + return atomic::Load(&amd_queue_.write_dispatch_id, + std::memory_order_acquire); + } + + uint64_t LoadWriteIndexRelaxed() { + return atomic::Load(&amd_queue_.write_dispatch_id, + 
std::memory_order_relaxed); + } + + void StoreReadIndexRelaxed(uint64_t value) { + atomic::Store(&amd_queue_.read_dispatch_id, value, + std::memory_order_relaxed); + } + + void StoreReadIndexRelease(uint64_t value) { + atomic::Store(&amd_queue_.read_dispatch_id, value, + std::memory_order_release); + } + + void StoreWriteIndexRelaxed(uint64_t value) { + atomic::Store(&amd_queue_.write_dispatch_id, value, + std::memory_order_relaxed); + } + + void StoreWriteIndexRelease(uint64_t value) { + atomic::Store(&amd_queue_.write_dispatch_id, value, + std::memory_order_release); + } + + uint64_t CasWriteIndexAcqRel(uint64_t expected, uint64_t value) { + return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, + std::memory_order_acq_rel); + } + + uint64_t CasWriteIndexAcquire(uint64_t expected, uint64_t value) { + return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, + std::memory_order_acquire); + } + + uint64_t CasWriteIndexRelaxed(uint64_t expected, uint64_t value) { + return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, + std::memory_order_relaxed); + } + + uint64_t CasWriteIndexRelease(uint64_t expected, uint64_t value) { + return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, + std::memory_order_release); + } + + uint64_t AddWriteIndexAcqRel(uint64_t value) { + return atomic::Add(&amd_queue_.write_dispatch_id, value, + std::memory_order_acq_rel); + } + + uint64_t AddWriteIndexAcquire(uint64_t value) { + return atomic::Add(&amd_queue_.write_dispatch_id, value, + std::memory_order_acquire); + } + + uint64_t AddWriteIndexRelaxed(uint64_t value) { + return atomic::Add(&amd_queue_.write_dispatch_id, value, + std::memory_order_relaxed); + } + + uint64_t AddWriteIndexRelease(uint64_t value) { + return atomic::Add(&amd_queue_.write_dispatch_id, value, + std::memory_order_release); + } + + hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count, + const uint32_t* cu_mask) { + return HSA_STATUS_ERROR; + } + + bool active() 
const { return active_; } + + void* operator new(size_t size) { + return _aligned_malloc(size, HSA_QUEUE_ALIGN_BYTES); + } + + void* operator new(size_t size, void* ptr) { return ptr; } + + void operator delete(void* ptr) { _aligned_free(ptr); } + + void operator delete(void*, void*) {} + + private: + static const size_t kRingAlignment = 256; + const uint32_t size_; + bool active_; + void* ring_; + + DISALLOW_COPY_AND_ASSIGN(HostQueue); +}; +} // namespace core +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/hsa_api_trace_int.h b/runtime/hsa-runtime/core/inc/hsa_api_trace_int.h new file mode 100644 index 0000000000..e4aa194342 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/hsa_api_trace_int.h @@ -0,0 +1,63 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
/// @brief Wrapper that pairs a global-namespace ::ApiTable with a separate
/// ExtTable copy.
///
/// NOTE(review): the member functions are only declared here.  The names
/// suggest Reset() restores the table's defaults and LinkExts() points the
/// table at an extension table; confirm against hsa_api_trace.cpp.
struct ApiTable {
  /// The wrapped table (the ::ApiTable type, not this struct).
  ::ApiTable table;
  /// Extension table storage held alongside the API table.
  ExtTable extension_backup;

  ApiTable();
  void Reset();
  void LinkExts(ExtTable* ptr);
};

// Two table instances, defined in another translation unit.
extern ApiTable hsa_api_table_;
extern ApiTable hsa_internal_api_table_;
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTME_CORE_INC_AMD_EXT_INTERFACE_H_ +#define HSA_RUNTME_CORE_INC_AMD_EXT_INTERFACE_H_ + +#include +#include + +#include "hsa_api_trace_int.h" + +#include "core/util/os.h" +#include "core/util/utils.h" + +namespace core { +struct ExtTableInternal : public ExtTable { + decltype(::hsa_amd_image_get_info_max_dim)* hsa_amd_image_get_info_max_dim_fn; + decltype(::hsa_amd_image_create)* hsa_amd_image_create_fn; +}; + +class ExtensionEntryPoints { + public: + ExtTableInternal table; + + ExtensionEntryPoints(); + + bool Load(std::string library_name); + void Unload(); + + private: + typedef void (*Load_t)(const ::ApiTable* table); + typedef void (*Unload_t)(); + + std::vector libs_; + + void InitTable(); + DISALLOW_COPY_AND_ASSIGN(ExtensionEntryPoints); +}; +} + +#endif diff --git a/runtime/hsa-runtime/core/inc/hsa_internal.h b/runtime/hsa-runtime/core/inc/hsa_internal.h new file mode 100644 index 0000000000..e1d3806425 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/hsa_internal.h @@ -0,0 +1,347 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_HSA_INTERNAL_H +#define HSA_RUNTIME_CORE_INC_HSA_INTERNAL_H + +#include "inc/hsa.h" + +namespace HSA +{ + + // Define core namespace interfaces - copy of function declarations in hsa.h + hsa_status_t HSA_API hsa_init(); + hsa_status_t HSA_API hsa_shut_down(); + hsa_status_t HSA_API + hsa_system_get_info(hsa_system_info_t attribute, void *value); + hsa_status_t HSA_API + hsa_system_extension_supported(uint16_t extension, uint16_t version_major, + uint16_t version_minor, bool *result); + hsa_status_t HSA_API + hsa_system_get_extension_table(uint16_t extension, uint16_t version_major, + uint16_t version_minor, void *table); + hsa_status_t HSA_API + hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent, void *data), + void *data); + hsa_status_t HSA_API hsa_agent_get_info(hsa_agent_t agent, + hsa_agent_info_t attribute, + void *value); + hsa_status_t HSA_API hsa_agent_get_exception_policies(hsa_agent_t agent, + hsa_profile_t profile, + uint16_t *mask); + hsa_status_t HSA_API + hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent, + uint16_t version_major, + uint16_t version_minor, bool *result); + hsa_status_t HSA_API + hsa_queue_create(hsa_agent_t agent, uint32_t size, hsa_queue_type_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, + void *data), + void *data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t **queue); + hsa_status_t HSA_API + hsa_soft_queue_create(hsa_region_t region, uint32_t size, + hsa_queue_type_t type, uint32_t features, + hsa_signal_t completion_signal, hsa_queue_t **queue); + hsa_status_t HSA_API hsa_queue_destroy(hsa_queue_t *queue); + hsa_status_t HSA_API hsa_queue_inactivate(hsa_queue_t *queue); + uint64_t HSA_API hsa_queue_load_read_index_acquire(const hsa_queue_t *queue); + uint64_t HSA_API hsa_queue_load_read_index_relaxed(const hsa_queue_t *queue); + 
uint64_t HSA_API hsa_queue_load_write_index_acquire(const hsa_queue_t *queue); + uint64_t HSA_API hsa_queue_load_write_index_relaxed(const hsa_queue_t *queue); + void HSA_API hsa_queue_store_write_index_relaxed(const hsa_queue_t *queue, + uint64_t value); + void HSA_API hsa_queue_store_write_index_release(const hsa_queue_t *queue, + uint64_t value); + uint64_t HSA_API hsa_queue_cas_write_index_acq_rel(const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + uint64_t HSA_API hsa_queue_cas_write_index_acquire(const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + uint64_t HSA_API hsa_queue_cas_write_index_relaxed(const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + uint64_t HSA_API hsa_queue_cas_write_index_release(const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + uint64_t HSA_API + hsa_queue_add_write_index_acq_rel(const hsa_queue_t *queue, uint64_t value); + uint64_t HSA_API + hsa_queue_add_write_index_acquire(const hsa_queue_t *queue, uint64_t value); + uint64_t HSA_API + hsa_queue_add_write_index_relaxed(const hsa_queue_t *queue, uint64_t value); + uint64_t HSA_API + hsa_queue_add_write_index_release(const hsa_queue_t *queue, uint64_t value); + void HSA_API hsa_queue_store_read_index_relaxed(const hsa_queue_t *queue, + uint64_t value); + void HSA_API hsa_queue_store_read_index_release(const hsa_queue_t *queue, + uint64_t value); + hsa_status_t HSA_API hsa_agent_iterate_regions( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t region, void *data), void *data); + hsa_status_t HSA_API hsa_region_get_info(hsa_region_t region, + hsa_region_info_t attribute, + void *value); + hsa_status_t HSA_API hsa_memory_register(void *address, size_t size); + hsa_status_t HSA_API hsa_memory_deregister(void *address, size_t size); + hsa_status_t HSA_API + hsa_memory_allocate(hsa_region_t region, size_t size, void **ptr); + hsa_status_t HSA_API hsa_memory_free(void *ptr); + hsa_status_t HSA_API hsa_memory_copy(void 
*dst, const void *src, size_t size); + hsa_status_t HSA_API hsa_memory_assign_agent(void *ptr, hsa_agent_t agent, + hsa_access_permission_t access); + hsa_status_t HSA_API + hsa_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers, + const hsa_agent_t *consumers, hsa_signal_t *signal); + hsa_status_t HSA_API hsa_signal_destroy(hsa_signal_t signal); + hsa_signal_value_t HSA_API hsa_signal_load_relaxed(hsa_signal_t signal); + hsa_signal_value_t HSA_API hsa_signal_load_acquire(hsa_signal_t signal); + void HSA_API + hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_store_release(hsa_signal_t signal, hsa_signal_value_t value); + hsa_signal_value_t HSA_API + hsa_signal_wait_relaxed(hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_expectancy_hint); + hsa_signal_value_t HSA_API + hsa_signal_wait_acquire(hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_expectancy_hint); + void HSA_API + hsa_signal_and_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_and_acquire(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_and_release(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_and_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_or_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_or_acquire(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_or_release(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_or_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_xor_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_xor_acquire(hsa_signal_t signal, hsa_signal_value_t 
value); + void HSA_API + hsa_signal_xor_release(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_xor_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_add_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_add_acquire(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_add_release(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_add_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_subtract_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_subtract_acquire(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_subtract_release(hsa_signal_t signal, hsa_signal_value_t value); + void HSA_API + hsa_signal_subtract_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + hsa_signal_value_t HSA_API + hsa_signal_exchange_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + hsa_signal_value_t HSA_API + hsa_signal_exchange_acquire(hsa_signal_t signal, hsa_signal_value_t value); + hsa_signal_value_t HSA_API + hsa_signal_exchange_release(hsa_signal_t signal, hsa_signal_value_t value); + hsa_signal_value_t HSA_API + hsa_signal_exchange_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + hsa_signal_value_t HSA_API hsa_signal_cas_acquire(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + hsa_signal_value_t HSA_API hsa_signal_cas_release(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + hsa_signal_value_t HSA_API hsa_signal_cas_acq_rel(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + hsa_status_t hsa_isa_from_name( + const char *name, + hsa_isa_t *isa + ); + hsa_status_t HSA_API hsa_isa_get_info( + hsa_isa_t 
isa, + hsa_isa_info_t attribute, + uint32_t index, + void *value + ); + hsa_status_t hsa_isa_compatible( + hsa_isa_t code_object_isa, + hsa_isa_t agent_isa, + bool *result + ); + hsa_status_t HSA_API hsa_code_object_serialize( + hsa_code_object_t code_object, + hsa_status_t (*alloc_callback)( + size_t size, hsa_callback_data_t data, void **address + ), + hsa_callback_data_t callback_data, + const char *options, + void **serialized_code_object, + size_t *serialized_code_object_size + ); + hsa_status_t HSA_API hsa_code_object_deserialize( + void *serialized_code_object, + size_t serialized_code_object_size, + const char *options, + hsa_code_object_t *code_object + ); + hsa_status_t HSA_API hsa_code_object_destroy( + hsa_code_object_t code_object + ); + hsa_status_t HSA_API hsa_code_object_get_info( + hsa_code_object_t code_object, + hsa_code_object_info_t attribute, + void *value + ); + hsa_status_t HSA_API hsa_code_object_get_symbol( + hsa_code_object_t code_object, + const char *symbol_name, + hsa_code_symbol_t *symbol + ); + hsa_status_t HSA_API hsa_code_symbol_get_info( + hsa_code_symbol_t code_symbol, + hsa_code_symbol_info_t attribute, + void *value + ); + hsa_status_t HSA_API hsa_code_object_iterate_symbols( + hsa_code_object_t code_object, + hsa_status_t (*callback)( + hsa_code_object_t code_object, hsa_code_symbol_t symbol, void *data + ), + void *data + ); + hsa_status_t HSA_API hsa_executable_create( + hsa_profile_t profile, + hsa_executable_state_t executable_state, + const char *options, + hsa_executable_t *executable + ); + hsa_status_t HSA_API hsa_executable_destroy( + hsa_executable_t executable + ); + hsa_status_t HSA_API hsa_executable_load_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options + ); + hsa_status_t HSA_API hsa_executable_freeze( + hsa_executable_t executable, + const char *options + ); + hsa_status_t HSA_API hsa_executable_get_info( + hsa_executable_t executable, + 
hsa_executable_info_t attribute, + void *value + ); + hsa_status_t HSA_API hsa_executable_global_variable_define( + hsa_executable_t executable, + const char *variable_name, + void *address + ); + hsa_status_t HSA_API hsa_executable_agent_global_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address + ); + hsa_status_t HSA_API hsa_executable_readonly_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address + ); + hsa_status_t HSA_API hsa_executable_validate( + hsa_executable_t executable, + uint32_t *result + ); + hsa_status_t HSA_API hsa_executable_get_symbol( + hsa_executable_t executable, + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention, + hsa_executable_symbol_t *symbol + ); + hsa_status_t HSA_API hsa_executable_symbol_get_info( + hsa_executable_symbol_t executable_symbol, + hsa_executable_symbol_info_t attribute, + void *value + ); + hsa_status_t HSA_API hsa_executable_iterate_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, hsa_executable_symbol_t symbol, void *data + ), + void *data + ); + hsa_status_t HSA_API + hsa_status_string(hsa_status_t status, const char **status_string); + +} + +#ifdef BUILDING_HSA_CORE_RUNTIME +//This using declaration is deliberate! +//We want unqualified name resolution to fail when building the runtime. This is a guard against accidental use of the intercept layer in the runtime. 
+using namespace HSA; +#endif + +#endif diff --git a/runtime/hsa-runtime/core/inc/hsa_table_interface.h b/runtime/hsa-runtime/core/inc/hsa_table_interface.h new file mode 100644 index 0000000000..8769de8825 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/hsa_table_interface.h @@ -0,0 +1,47 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "hsa_api_trace.h" + +void hsa_table_interface_init(const ApiTable* table); + +const ApiTable* hsa_table_interface_get_table(); diff --git a/runtime/hsa-runtime/core/inc/interrupt_signal.h b/runtime/hsa-runtime/core/inc/interrupt_signal.h new file mode 100644 index 0000000000..19c2d59642 --- /dev/null +++ b/runtime/hsa-runtime/core/inc/interrupt_signal.h @@ -0,0 +1,206 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA runtime C++ interface file.
+
+#ifndef HSA_RUNTME_CORE_INC_INTERRUPT_SIGNAL_H_
+#define HSA_RUNTME_CORE_INC_INTERRUPT_SIGNAL_H_
+
+#include "hsakmt.h"
+
+#include "core/inc/runtime.h"
+#include "core/inc/signal.h"
+#include "core/util/utils.h"
+
+namespace core {
+
+/// @brief A Signal implementation using interrupts versus plain memory based.
+/// Also see base class Signal.
+///
+/// Breaks common/vendor separation - signals in general need to be re-worked
+/// at the foundation level to make sense in a multi-device system.
+/// Supports only one waiter for now.
+/// KFD changes are needed to support multiple waiters and have device
+/// signaling.
+class InterruptSignal : public Signal {
+ public:
+  static HsaEvent* CreateEvent(HSA_EVENTTYPE type, bool manual_reset);
+  static void DestroyEvent(HsaEvent* evt);
+
+  /// @brief Determines if a Signal* can be safely converted to an
+  /// InterruptSignal* via static_cast.
+  static __forceinline bool IsType(Signal* ptr) {
+    return ptr->IsType(&rtti_id_);
+  }
+
+  explicit InterruptSignal(hsa_signal_value_t initial_value,
+                           HsaEvent* use_event = NULL);
+
+  ~InterruptSignal();
+
+  // Below are various methods corresponding to the APIs, which load/store the
+  // signal value or modify the existing signal value atomically and with
+  // specified memory ordering semantics.
+
+  hsa_signal_value_t LoadRelaxed();
+
+  hsa_signal_value_t LoadAcquire();
+
+  void StoreRelaxed(hsa_signal_value_t value);
+
+  void StoreRelease(hsa_signal_value_t value);
+
+  hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition,
+                                 hsa_signal_value_t compare_value,
+                                 uint64_t timeout, hsa_wait_state_t wait_hint);
+
+  hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition,
+                                 hsa_signal_value_t compare_value,
+                                 uint64_t timeout, hsa_wait_state_t wait_hint);
+
+  void AndRelaxed(hsa_signal_value_t value);
+
+  void AndAcquire(hsa_signal_value_t value);
+
+  void AndRelease(hsa_signal_value_t value);
+
+  void AndAcqRel(hsa_signal_value_t value);
+
+  void OrRelaxed(hsa_signal_value_t value);
+
+  void OrAcquire(hsa_signal_value_t value);
+
+  void OrRelease(hsa_signal_value_t value);
+
+  void OrAcqRel(hsa_signal_value_t value);
+
+  void XorRelaxed(hsa_signal_value_t value);
+
+  void XorAcquire(hsa_signal_value_t value);
+
+  void XorRelease(hsa_signal_value_t value);
+
+  void XorAcqRel(hsa_signal_value_t value);
+
+  void AddRelaxed(hsa_signal_value_t value);
+
+  void AddAcquire(hsa_signal_value_t value);
+
+  void AddRelease(hsa_signal_value_t value);
+
+  void AddAcqRel(hsa_signal_value_t value);
+
+  void SubRelaxed(hsa_signal_value_t value);
+
+  void SubAcquire(hsa_signal_value_t value);
+
+  void SubRelease(hsa_signal_value_t value);
+
+  void SubAcqRel(hsa_signal_value_t value);
+
+  hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value);
+
+  hsa_signal_value_t ExchAcquire(hsa_signal_value_t value);
+
+  hsa_signal_value_t ExchRelease(hsa_signal_value_t value);
+
+  hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value);
+
+  hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected,
+                                hsa_signal_value_t value);
+
+  hsa_signal_value_t CasAcquire(hsa_signal_value_t expected,
+                                hsa_signal_value_t value);
+
+  hsa_signal_value_t CasRelease(hsa_signal_value_t expected,
+                                hsa_signal_value_t value);
+
+  hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected,
+                               hsa_signal_value_t value);
+
+  /// @brief See base class Signal. Returns the address of signal_.value.
+  __forceinline hsa_signal_value_t* ValueLocation() const {
+    return (hsa_signal_value_t*)&signal_.value;
+  }
+
+  /// @brief See base class Signal. Returns the event held in event_.
+  __forceinline HsaEvent* EopEvent() { return event_; }
+
+  // TODO(bwicakso) : work around for SDMA async copy. Bypass waiting on EOP
+  // event because SDMA copy does not handle interrupt yet.
+  __forceinline void DisableWaitEvent() { wait_on_event_ = false; }
+
+  /// @brief Allocate with malloc so that allocation does not throw.
+  void* operator new(size_t size) { return malloc(size); }
+
+  /// @brief Counterpart of operator new above: release with free.
+  void operator delete(void* ptr) { free(ptr); }
+
+ protected:
+  bool _IsA(rtti_t id) const { return id == &rtti_id_; }
+
+ private:
+  /// @variable KFD event on which the interrupt signal is based.
+  HsaEvent* event_;
+
+  /// @variable Indicates whether the signal should release the event when it
+  /// closes or not.
+  bool free_event_;
+
+  // TODO(bwicakso) : work around for SDMA async copy. Bypass waiting on EOP
+  // event because SDMA copy does not handle interrupt yet.
+  bool wait_on_event_;
+
+  /// Used to obtain a globally unique value (address) for rtti.
+  static int rtti_id_;
+
+  /// @brief Notify driver of signal value change if necessary, i.e. only
+  /// when InWaiting() reports that something is waiting on this signal.
+  __forceinline void SetEvent() {
+    std::atomic_signal_fence(std::memory_order_seq_cst);
+    if (InWaiting()) hsaKmtSetEvent(event_);
+  }
+
+  DISALLOW_COPY_AND_ASSIGN(InterruptSignal);
+};
+
+}  // namespace core
+#endif  // header guard
diff --git a/runtime/hsa-runtime/core/inc/isa.h b/runtime/hsa-runtime/core/inc/isa.h
new file mode 100644
index 0000000000..46cdc85a1d
--- /dev/null
+++ b/runtime/hsa-runtime/core/inc/isa.h
@@ -0,0 +1,164 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_ISA_H_
+#define HSA_RUNTIME_CORE_ISA_H_
+
+#include <cassert>  // NOTE(review): the five <...> names were lost in extraction; restored from usage below - confirm
+#include <cstdint>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include "core/inc/amd_hsa_code.hpp"
+
+namespace core {
+
+// @class Isa
+// @brief Instruction Set Architecture
+class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> {
+ public:
+  // @brief Isa's version type: (major, minor, stepping), see the getters below
+  typedef std::tuple<int32_t, int32_t, int32_t> Version;
+
+  // @brief Default destructor
+  ~Isa() {}
+
+  // @returns Handle equivalent of @p isa_object (its address stored as integer)
+  static hsa_isa_t Handle(const Isa *isa_object) {
+    hsa_isa_t isa_handle = { reinterpret_cast<uint64_t>(isa_object) };
+    return isa_handle;
+  }
+  // @returns Object equivalent of @p isa_handle
+  static Isa *Object(const hsa_isa_t &isa_handle) {
+    Isa *isa_object = amd::hsa::common::ObjectAt<Isa>(isa_handle.handle);
+    return isa_object;
+  }
+
+  // @returns This Isa's version
+  const Version &version() const {
+    return version_;
+  }
+
+  // @returns This Isa's vendor
+  std::string GetVendor() const {
+    return "AMD";
+  }
+  // @returns This Isa's architecture
+  std::string GetArchitecture() const {
+    return "AMDGPU";
+  }
+  // @returns This Isa's major version
+  int32_t GetMajorVersion() const {
+    return std::get<0>(version_);
+  }
+  // @returns This Isa's minor version
+  int32_t GetMinorVersion() const {
+    return std::get<1>(version_);
+  }
+  // @returns This Isa's stepping
+  int32_t GetStepping() const {
+    return std::get<2>(version_);
+  }
+
+  // @returns True if this Isa's version equals that of @p isa_object, false otherwise
+  bool IsCompatible(const Isa *isa_object) const {
+    assert(isa_object);
+    return version_ == isa_object->version_;
+  }
+  // @returns True if this Isa is compatible with @p isa_handle, false otherwise
+  bool IsCompatible(const hsa_isa_t &isa_handle) const {
+    assert(isa_handle.handle);
+    return IsCompatible(Object(isa_handle));
+  }
+  // @brief Isa is always in valid state
+  bool IsValid() const {
+    return true;
+  }
+
+  // @returns This Isa's full name
+  std::string GetFullName() const;
+
+  // @brief Query value of requested @p attribute and record it in @p value
+  bool GetInfo(const hsa_isa_info_t &attribute, void *value) const;
+
+ private:
+  // @brief Default constructor - version is set to (-1, -1, -1)
+  Isa(): version_(Version(-1, -1, -1)) {}
+
+  // @brief Construct from @p version
+  Isa(const Version &version): version_(version) {}
+
+  // @brief Isa's version
+  Version version_;
+
+  // @brief Isa's friends
+  friend class IsaRegistry;
+}; // class Isa
+
+// @class IsaRegistry
+// @brief Instruction Set Architecture Registry
+class IsaRegistry final {
+ public:
+  // @returns Isa for requested @p full_name, null pointer if not supported
+  static const Isa *GetIsa(const std::string &full_name);
+  // @returns Isa for requested @p version, null pointer if not supported
+  static const Isa *GetIsa(const Isa::Version &version);
+
+ private:
+  // @brief IsaRegistry's map type. NOTE(review): arguments were stripped; name -> Isa assumed, confirm
+  typedef std::unordered_map<std::string, Isa> IsaMap;
+
+  // @brief Supported instruction set architectures
+  static const IsaMap supported_isas_;
+
+  // @brief Default constructor - not available
+  IsaRegistry();
+  // @brief Default destructor - not available
+  ~IsaRegistry();
+
+  // @returns Supported instruction set architectures
+  static const IsaMap GetSupportedIsas();
+}; // class IsaRegistry
+
+} // namespace core
+
+#endif // HSA_RUNTIME_CORE_ISA_H_
diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h
new
file mode 100644 index 0000000000..ea37b6a2bc --- /dev/null +++ b/runtime/hsa-runtime/core/inc/memory_region.h @@ -0,0 +1,109 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA runtime C++ interface file.
+
+#ifndef HSA_RUNTME_CORE_INC_MEMORY_REGION_H_
+#define HSA_RUNTME_CORE_INC_MEMORY_REGION_H_
+
+#include <cassert>  // NOTE(review): header name lost in extraction; chosen because assert() is used below - confirm
+
+#include "core/inc/runtime.h"
+#include "core/inc/agent.h"
+#include "core/inc/checked.h"
+
+namespace core {
+class Agent;
+
+// Abstract memory region; handles are the object's address stored in hsa_region_t.
+class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
+ public:
+  MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner)
+      : fine_grain_(fine_grain), full_profile_(full_profile), owner_(owner) {
+    assert(owner_ != NULL);
+  }
+
+  virtual ~MemoryRegion() {}
+
+  // Convert this object into hsa_region_t.
+  static __forceinline hsa_region_t Convert(MemoryRegion* region) {
+    const hsa_region_t region_handle = {
+        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(region))};
+    return region_handle;
+  }
+
+  static __forceinline const hsa_region_t Convert(const MemoryRegion* region) {
+    const hsa_region_t region_handle = {
+        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(region))};
+    return region_handle;
+  }
+
+  // Convert hsa_region_t into MemoryRegion *.
+  static __forceinline MemoryRegion* Convert(hsa_region_t region) {
+    return reinterpret_cast<MemoryRegion*>(region.handle);
+  }
+
+  virtual hsa_status_t Allocate(size_t size, void** address) const = 0;
+
+  virtual hsa_status_t Free(void* address, size_t size) const = 0;
+
+  // Translate memory properties into HSA region attribute.
+  virtual hsa_status_t GetInfo(hsa_region_info_t attribute,
+                               void* value) const = 0;
+
+  virtual hsa_status_t AssignAgent(void* ptr, size_t size, const Agent& agent,
+                                   hsa_access_permission_t access) const = 0;
+
+  __forceinline bool fine_grain() const { return fine_grain_; }
+
+  __forceinline bool full_profile() const { return full_profile_; }
+
+  __forceinline core::Agent* owner() const { return owner_; }
+
+ private:
+  const bool fine_grain_;
+  const bool full_profile_;
+
+  core::Agent* owner_;
+};
+}  // namespace core
+
+#endif  // header guard
diff --git a/runtime/hsa-runtime/core/inc/queue.h b/runtime/hsa-runtime/core/inc/queue.h
new file mode 100644
index 0000000000..c1a56bded5
--- /dev/null
+++ b/runtime/hsa-runtime/core/inc/queue.h
@@ -0,0 +1,322 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA runtime C++ interface file.
+
+#ifndef HSA_RUNTME_CORE_INC_COMMAND_QUEUE_H_
+#define HSA_RUNTME_CORE_INC_COMMAND_QUEUE_H_
+#include <sstream>  // header name restored from the std::stringstream use in AqlPacket::string()
+
+#include "core/common/shared.h"
+
+#include "core/inc/runtime.h"
+#include "core/inc/checked.h"
+
+#include "core/util/utils.h"
+
+#include "inc/amd_hsa_queue.h"
+
+namespace core {
+struct AqlPacket {
+  // One AQL packet viewed as any of the packet layouts below.
+  union {
+    hsa_kernel_dispatch_packet_t dispatch;
+    hsa_barrier_and_packet_t barrier_and;
+    hsa_barrier_or_packet_t barrier_or;
+    hsa_agent_dispatch_packet_t agent;
+  };
+
+  uint8_t type() {
+    return ((dispatch.header >> HSA_PACKET_HEADER_TYPE) &
+            ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1));
+  }
+
+  bool IsValid() {
+    const uint8_t packet_type = dispatch.header >> HSA_PACKET_HEADER_TYPE;
+    return (packet_type > HSA_PACKET_TYPE_INVALID &&
+            packet_type <= HSA_PACKET_TYPE_BARRIER_OR);
+  }
+
+  std::string string() const {
+    std::stringstream string;
+    uint8_t type = ((dispatch.header >> HSA_PACKET_HEADER_TYPE) &
+                    ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1));
+
+    const char* type_names[] = {
+        "HSA_PACKET_TYPE_VENDOR_SPECIFIC", "HSA_PACKET_TYPE_INVALID",
+        "HSA_PACKET_TYPE_KERNEL_DISPATCH", "HSA_PACKET_TYPE_BARRIER_AND",
+        "HSA_PACKET_TYPE_AGENT_DISPATCH", "HSA_PACKET_TYPE_BARRIER_OR"};
+    // The header is not validated here, so guard the name-table lookup.
+    string << "type: " << (type < sizeof(type_names) / sizeof(type_names[0]) ? type_names[type] : "UNKNOWN")
+           << "\nbarrier: " << ((dispatch.header >> HSA_PACKET_HEADER_BARRIER) &
+                                ((1 << HSA_PACKET_HEADER_WIDTH_BARRIER) - 1))
+           << "\nacquire: "
+           << ((dispatch.header >> HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) &
+               ((1 << HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE) - 1))
+           << "\nrelease: "
+           << ((dispatch.header >> HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) &
+               ((1 << HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE) - 1));
+
+    if (type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
+      string << "\nDim: " << dispatch.setup
+             << "\nworkgroup_size: " << dispatch.workgroup_size_x << ", "
+             << dispatch.workgroup_size_y << ", " << dispatch.workgroup_size_z
+             << "\ngrid_size: " << dispatch.grid_size_x << ", "
+             << dispatch.grid_size_y << ", " << dispatch.grid_size_z
+             << "\nprivate_size: " << dispatch.private_segment_size
+             << "\ngroup_size: " << dispatch.group_segment_size
+             << "\nkernel_object: " << dispatch.kernel_object
+             << "\nkern_arg: " << dispatch.kernarg_address
+             << "\nsignal: " << dispatch.completion_signal.handle;
+    }
+
+    if ((type == HSA_PACKET_TYPE_BARRIER_AND) ||
+        (type == HSA_PACKET_TYPE_BARRIER_OR)) {
+      for (int i = 0; i < 5; i++)
+        string << "\ndep[" << i << "]: " << barrier_and.dep_signal[i].handle;
+      string << "\nsignal: " << barrier_and.completion_signal.handle;
+    }
+
+    return string.str();
+  }
+};
+
+class Queue;
+
+/// @brief Helper structure to simplify conversion of amd_queue_t and
+/// core::Queue object.
+struct SharedQueue {
+  amd_queue_t amd_queue;
+  Queue* core_queue;
+};
+
+/// @brief Class Queue which encapsulates user mode queues and
+/// provides Api to access its Read, Write indices using Acquire,
+/// Release and Relaxed semantics.
+/*
+Queue is intended to be a pure interface class and may be wrapped or replaced
+by tools.
+All functions other than Convert and public_handle must be virtual.
+*/
+class Queue : public Checked<0xFA3906A679F9DB49>,
+              public Shared<SharedQueue> {
+ public:
+  Queue() : Shared(), amd_queue_(shared_object()->amd_queue), public_handle_(NULL) {
+    if (!Shared::IsSharedObjectAllocationValid()) {
+      return;
+    }
+
+    shared_object()->core_queue = this;
+
+    public_handle_ = Convert(this);
+  }
+
+  virtual ~Queue() {}
+
+  /// @brief Returns the handle of Queue's public data type
+  ///
+  /// @param queue Pointer to an instance of Queue implementation object
+  ///
+  /// @return hsa_queue_t * Pointer to the public data type of a queue
+  static __forceinline hsa_queue_t* Convert(Queue* queue) {
+    return ((queue != NULL) && (queue->IsSharedObjectAllocationValid()))
+               ? &queue->amd_queue_.hsa_queue
+               : NULL;
+  }
+
+  /// @brief Transform the public data type of a Queue's data type into an
+  /// instance of its Queue class object
+  ///
+  /// @param queue Handle of public data type of a queue
+  ///
+  /// @return Queue * Pointer to the Queue's implementation object
+  static __forceinline Queue* Convert(const hsa_queue_t* queue) {
+    return (queue != NULL)
+               ? reinterpret_cast<SharedQueue*>(
+                     reinterpret_cast<uintptr_t>(queue) -
+                     (reinterpret_cast<uintptr_t>(
+                          &reinterpret_cast<SharedQueue*>(1234)
+                               ->amd_queue.hsa_queue) -
+                      uintptr_t(1234)))->core_queue
+               : NULL;
+  }
+
+  /// @brief Inactivate the queue object. Once inactivated, a
+  /// queue cannot be used anymore and must be destroyed
+  ///
+  /// @return hsa_status_t Status of request
+  virtual hsa_status_t Inactivate() = 0;
+
+  /// @brief Reads the Read Index of Queue using Acquire semantics
+  ///
+  /// @return uint64_t Value of Read index
+  virtual uint64_t LoadReadIndexAcquire() = 0;
+
+  /// @brief Reads the Read Index of Queue using Relaxed semantics
+  ///
+  /// @return uint64_t Value of Read index
+  virtual uint64_t LoadReadIndexRelaxed() = 0;
+
+  /// @brief Reads the Write Index of Queue using Acquire semantics
+  ///
+  /// @return uint64_t Value of Write index
+  virtual uint64_t LoadWriteIndexAcquire() = 0;
+
+  /// @brief Reads the Write Index of Queue using Relaxed semantics
+  ///
+  /// @return uint64_t Value of Write index
+  virtual uint64_t LoadWriteIndexRelaxed() = 0;
+
+  /// @brief Updates the Read Index of Queue using Relaxed semantics
+  ///
+  /// @param value New value of Read index to update
+  virtual void StoreReadIndexRelaxed(uint64_t value) = 0;
+
+  /// @brief Updates the Read Index of Queue using Release semantics
+  ///
+  /// @param value New value of Read index to update
+  virtual void StoreReadIndexRelease(uint64_t value) = 0;
+
+  /// @brief Updates the Write Index of Queue using Relaxed semantics
+  ///
+  /// @param value New value of Write index to update
+  virtual void StoreWriteIndexRelaxed(uint64_t value) = 0;
+
+  /// @brief Updates the Write Index of Queue using Release semantics
+  ///
+  /// @param value New value of Write index to update
+  virtual void StoreWriteIndexRelease(uint64_t value) = 0;
+
+  /// @brief Compares and swaps Write index using Acquire and Release semantics
+  ///
+  /// @param expected Current value of write index
+  ///
+  /// @param value Value of new write index
+  ///
+  /// @return uint64_t Value of write index before the update
+  virtual uint64_t CasWriteIndexAcqRel(uint64_t expected, uint64_t value) = 0;
+
+  /// @brief Compares and swaps Write index using Acquire semantics
+  ///
+  /// @param expected Current value of write index
+  ///
+  /// @param value Value of new write index
+  ///
+  /// @return uint64_t Value of write index before the update
+  virtual uint64_t CasWriteIndexAcquire(uint64_t expected, uint64_t value) = 0;
+
+  /// @brief Compares and swaps Write index using Relaxed semantics
+  ///
+  /// @param expected Current value of write index
+  ///
+  /// @param value Value of new write index
+  ///
+  /// @return uint64_t Value of write index before the update
+  virtual uint64_t CasWriteIndexRelaxed(uint64_t expected, uint64_t value) = 0;
+
+  /// @brief Compares and swaps Write index using Release semantics
+  ///
+  /// @param expected Current value of write index
+  ///
+  /// @param value Value of new write index
+  ///
+  /// @return uint64_t Value of write index before the update
+  virtual uint64_t CasWriteIndexRelease(uint64_t expected, uint64_t value) = 0;
+
+  /// @brief Updates the Write index using Acquire and Release semantics
+  ///
+  /// @param value Value of new write index
+  ///
+  /// @return uint64_t Value of write index before the update
+  virtual uint64_t AddWriteIndexAcqRel(uint64_t value) = 0;
+
+  /// @brief Updates the Write index using Acquire semantics
+  ///
+  /// @param value Value of new write index
+  ///
+  /// @return uint64_t Value of write index before the update
+  virtual uint64_t AddWriteIndexAcquire(uint64_t value) = 0;
+
+  /// @brief Updates the Write index using Relaxed semantics
+  ///
+  /// @param value Value of new write index
+  ///
+  /// @return uint64_t Value of write index before the update
+  virtual uint64_t AddWriteIndexRelaxed(uint64_t value) = 0;
+
+  /// @brief Updates the Write index using Release semantics
+  ///
+  /// @param value Value of new write index
+  ///
+  /// @return uint64_t Value of write index before the update
+  virtual uint64_t AddWriteIndexRelease(uint64_t value) = 0;
+
+  /// @brief Set CU Masking
+  ///
+  /// @param num_cu_mask_count size of mask bit array
+  ///
+  /// @param cu_mask pointer to cu mask
+  ///
+  /// @return hsa_status_t
+  virtual hsa_status_t SetCUMasking(const uint32_t num_cu_mask_count,
+                                    const uint32_t* cu_mask) = 0;
+
+  // Handle of AMD Queue struct
+  amd_queue_t& amd_queue_;
+
+  hsa_queue_t* public_handle() const { return public_handle_; }
+
+ protected:
+  static void set_public_handle(Queue* ptr, hsa_queue_t* handle) {
+    ptr->do_set_public_handle(handle);
+  }
+  virtual void do_set_public_handle(hsa_queue_t* handle) {
+    public_handle_ = handle;
+  }
+  hsa_queue_t* public_handle_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Queue);
+};
+}
+
+#endif  // header guard
diff --git a/runtime/hsa-runtime/core/inc/registers.h b/runtime/hsa-runtime/core/inc/registers.h
new file mode 100644
index 0000000000..d2bffb654d
--- /dev/null
+++ b/runtime/hsa-runtime/core/inc/registers.h
@@ -0,0 +1,204 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// This file is used only for open source cmake builds, if we hardcode the +// register values in amd_aql_queue.cpp then this file won't be required. For +// now we are using this file where register details are spelled out in the +// structs/unions below. 
+#ifndef HSA_RUNTME_CORE_INC_REGISTERS_H_
+#define HSA_RUNTME_CORE_INC_REGISTERS_H_
+
+typedef enum SQ_RSRC_BUF_TYPE {
+SQ_RSRC_BUF = 0x00000000,
+SQ_RSRC_BUF_RSVD_1 = 0x00000001,
+SQ_RSRC_BUF_RSVD_2 = 0x00000002,
+SQ_RSRC_BUF_RSVD_3 = 0x00000003,
+} SQ_RSRC_BUF_TYPE;
+
+typedef enum BUF_DATA_FORMAT {
+BUF_DATA_FORMAT_INVALID = 0x00000000,
+BUF_DATA_FORMAT_8 = 0x00000001,
+BUF_DATA_FORMAT_16 = 0x00000002,
+BUF_DATA_FORMAT_8_8 = 0x00000003,
+BUF_DATA_FORMAT_32 = 0x00000004,
+BUF_DATA_FORMAT_16_16 = 0x00000005,
+BUF_DATA_FORMAT_10_11_11 = 0x00000006,
+BUF_DATA_FORMAT_11_11_10 = 0x00000007,
+BUF_DATA_FORMAT_10_10_10_2 = 0x00000008,
+BUF_DATA_FORMAT_2_10_10_10 = 0x00000009,
+BUF_DATA_FORMAT_8_8_8_8 = 0x0000000a,
+BUF_DATA_FORMAT_32_32 = 0x0000000b,
+BUF_DATA_FORMAT_16_16_16_16 = 0x0000000c,
+BUF_DATA_FORMAT_32_32_32 = 0x0000000d,
+BUF_DATA_FORMAT_32_32_32_32 = 0x0000000e,
+BUF_DATA_FORMAT_RESERVED_15 = 0x0000000f,
+} BUF_DATA_FORMAT;
+
+typedef enum BUF_NUM_FORMAT {
+BUF_NUM_FORMAT_UNORM = 0x00000000,
+BUF_NUM_FORMAT_SNORM = 0x00000001,
+BUF_NUM_FORMAT_USCALED = 0x00000002,
+BUF_NUM_FORMAT_SSCALED = 0x00000003,
+BUF_NUM_FORMAT_UINT = 0x00000004,
+BUF_NUM_FORMAT_SINT = 0x00000005,
+BUF_NUM_FORMAT_SNORM_OGL__SI__CI = 0x00000006,
+BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006,  // same encoding (6) as the entry above
+BUF_NUM_FORMAT_FLOAT = 0x00000007,
+} BUF_NUM_FORMAT;
+
+typedef enum SQ_SEL_XYZW01 {
+SQ_SEL_0 = 0x00000000,
+SQ_SEL_1 = 0x00000001,
+SQ_SEL_RESERVED_0 = 0x00000002,
+SQ_SEL_RESERVED_1 = 0x00000003,
+SQ_SEL_X = 0x00000004,
+SQ_SEL_Y = 0x00000005,
+SQ_SEL_Z = 0x00000006,
+SQ_SEL_W = 0x00000007,
+} SQ_SEL_XYZW01;
+
+    union COMPUTE_TMPRING_SIZE {
+      struct {  // has no members when neither endian macro below is defined
+#if defined(LITTLEENDIAN_CPU)
+        unsigned int WAVES : 12;
+        unsigned int WAVESIZE : 13;
+        unsigned int : 7;
+#elif defined(BIGENDIAN_CPU)
+        unsigned int : 7;
+        unsigned int WAVESIZE : 13;
+        unsigned int WAVES : 12;
+#endif
+      } bitfields, bits;  // two member names for the same bitfield layout
+      unsigned int u32All;
+      signed int i32All;
+      float f32All;
+    };
+
+
+    union SQ_BUF_RSRC_WORD0 {
+      struct {
+#if defined(LITTLEENDIAN_CPU)
+        unsigned int BASE_ADDRESS : 32;
+#elif defined(BIGENDIAN_CPU)
+        unsigned int BASE_ADDRESS : 32;  // single full-width field: both branches are identical
+#endif
+      } bitfields, bits;
+      unsigned int u32All;
+      signed int i32All;
+      float f32All;
+    };
+
+
+    union SQ_BUF_RSRC_WORD1 {
+      struct {
+#if defined(LITTLEENDIAN_CPU)
+        unsigned int BASE_ADDRESS_HI : 16;
+        unsigned int STRIDE : 14;
+        unsigned int CACHE_SWIZZLE : 1;
+        unsigned int SWIZZLE_ENABLE : 1;
+#elif defined(BIGENDIAN_CPU)
+        unsigned int SWIZZLE_ENABLE : 1;
+        unsigned int CACHE_SWIZZLE : 1;
+        unsigned int STRIDE : 14;
+        unsigned int BASE_ADDRESS_HI : 16;
+#endif
+      } bitfields, bits;
+      unsigned int u32All;
+      signed int i32All;
+      float f32All;
+    };
+
+
+    union SQ_BUF_RSRC_WORD2 {
+      struct {
+#if defined(LITTLEENDIAN_CPU)
+        unsigned int NUM_RECORDS : 32;
+#elif defined(BIGENDIAN_CPU)
+        unsigned int NUM_RECORDS : 32;  // single full-width field: both branches are identical
+#endif
+      } bitfields, bits;
+      unsigned int u32All;
+      signed int i32All;
+      float f32All;
+    };
+
+
+    union SQ_BUF_RSRC_WORD3 {
+      struct {
+#if defined(LITTLEENDIAN_CPU)
+        unsigned int DST_SEL_X : 3;
+        unsigned int DST_SEL_Y : 3;
+        unsigned int DST_SEL_Z : 3;
+        unsigned int DST_SEL_W : 3;
+        unsigned int NUM_FORMAT : 3;
+        unsigned int DATA_FORMAT : 4;
+        unsigned int ELEMENT_SIZE : 2;
+        unsigned int INDEX_STRIDE : 2;
+        unsigned int ADD_TID_ENABLE : 1;
+        unsigned int ATC__CI__VI : 1;
+        unsigned int HASH_ENABLE : 1;
+        unsigned int HEAP : 1;
+        unsigned int MTYPE__CI__VI : 3;
+        unsigned int TYPE : 2;
+#elif defined(BIGENDIAN_CPU)
+        unsigned int TYPE : 2;
+        unsigned int MTYPE__CI__VI : 3;
+        unsigned int HEAP : 1;
+        unsigned int HASH_ENABLE : 1;
+        unsigned int ATC__CI__VI : 1;
+        unsigned int ADD_TID_ENABLE : 1;
+        unsigned int INDEX_STRIDE : 2;
+        unsigned int ELEMENT_SIZE : 2;
+        unsigned int DATA_FORMAT : 4;
+        unsigned int NUM_FORMAT : 3;
+        unsigned int DST_SEL_W : 3;
+        unsigned int DST_SEL_Z : 3;
+        unsigned int DST_SEL_Y : 3;
+        unsigned int DST_SEL_X : 3;
+#endif
+      } bitfields, bits;
+      unsigned int u32All;
+      signed int i32All;
+      float f32All;
+    };
+
+#endif  // header guard
diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h
new file mode 100644
index 0000000000..c59a6ee0d2
--- /dev/null
+++ b/runtime/hsa-runtime/core/inc/runtime.h
@@ -0,0 +1,498 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA runtime C++ interface file. + +#ifndef HSA_RUNTME_CORE_INC_RUNTIME_H_ +#define HSA_RUNTME_CORE_INC_RUNTIME_H_ + +#include +#include + +#include "core/inc/hsa_ext_interface.h" +#include "core/inc/hsa_internal.h" + +#include "core/inc/agent.h" +#include "core/inc/memory_region.h" +#include "core/inc/signal.h" +#include "core/util/utils.h" +#include "core/util/locks.h" +#include "core/util/os.h" + +#include "core/inc/amd_loader_context.hpp" +#include "amd_hsa_code.hpp" + +//---------------------------------------------------------------------------// +// Constants // +//---------------------------------------------------------------------------// + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +namespace core { +extern bool g_use_interrupt_wait; + +/// @brief Runtime class provides the following functions: +/// - open and close connection to kernel driver. +/// - load supported extension library (image and finalizer). +/// - load tools library. +/// - expose supported agents. +/// - allocate and free memory. +/// - memory copy and fill. +/// - grant access to memory (dgpu memory pool extension). +/// - maintain loader state. +/// - monitor asynchronous event from agent. +class Runtime { + public: + /// @brief Structure to describe connectivity between agents. + struct LinkInfo { + uint32_t num_hop; + hsa_amd_memory_pool_link_info_t info; + }; + + /// @brief Open connection to kernel driver and increment reference count. + /// @retval True if the connection to kernel driver is successfully opened. 
+ static bool Acquire(); + + /// @brief Checks if connection to kernel driver is opened. + /// @retval True if the connection to kernel driver is opened. + static bool IsOpen(); + + /// @brief Callback handler for VM fault access. + static bool VMFaultHandler(hsa_signal_value_t val, void* arg); + + /// @brief Singleton object of the runtime. + static Runtime* runtime_singleton_; + + /// @brief Decrement reference count and close connection to kernel driver. + /// @retval True if reference count is larger than 0. + bool Release(); + + /// @brief Insert agent into agent list ::agents_. + /// @param [in] agent Pointer to the agent object. + void RegisterAgent(Agent* agent); + + /// @brief Delete all agent objects from ::agents_. + void DestroyAgents(); + + /// @brief Set the number of links connecting the agents in the platform. + void SetLinkCount(size_t num_link); + + /// @brief Register link information connecting @p node_id_from and @p + /// node_id_to. + /// @param [in] node_id_from Node id of the source node. + /// @param [in] node_id_to Node id of the destination node. + /// @param [in] num_hop Hop count for the link (NOTE(review): presumably stored in LinkInfo::num_hop - confirm). + /// @param [in] link_info The link information between source and destination nodes. + void RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to, + uint32_t num_hop, + hsa_amd_memory_pool_link_info_t& link_info); + + /// @brief Query link information between two nodes. + /// @param [in] node_id_from Node id of the source node. + /// @param [in] node_id_to Node id of the destination node. + /// @retval The link information between source and destination nodes. + const LinkInfo& GetLinkInfo(uint32_t node_id_from, uint32_t node_id_to); + + /// @brief Invoke the user provided call back for each agent in the agent + /// list. + /// + /// @param [in] callback User provided callback function. + /// @param [in] data User provided pointer as input for @p callback. 
+ /// + /// @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed + /// agent returns ::HSA_STATUS_SUCCESS. 
+ hsa_status_t IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent, + void* data), + void* data); + + /// @brief Allocate memory on a particular region. + /// + /// @param [in] region Pointer to region object. + /// @param [in] size Allocation size in bytes. + /// @param [out] address Pointer to store the allocation result. + /// + /// @retval ::HSA_STATUS_SUCCESS If allocation is successful. + hsa_status_t AllocateMemory(const MemoryRegion* region, size_t size, + void** address); + + /// @brief Allocate memory on a particular region with option to restrict + /// access to the owning agent. + /// + /// @param [in] restrict_access If true, the allocation result would only be + /// accessible to the agent(s) that own the region object. + /// @param [in] region Pointer to region object. + /// @param [in] size Allocation size in bytes. + /// @param [out] address Pointer to store the allocation result. + /// + /// @retval ::HSA_STATUS_SUCCESS If allocation is successful. + hsa_status_t AllocateMemory(bool restrict_access, const MemoryRegion* region, + size_t size, void** address); + + /// @brief Free memory previously allocated with AllocateMemory. + /// + /// @param [in] ptr Address of the memory to be freed. + /// + /// @retval ::HSA_STATUS_ERROR If @p ptr is not the address of previous + /// allocation via ::core::Runtime::AllocateMemory + /// @retval ::HSA_STATUS_SUCCESS if @p ptr is successfully released. + hsa_status_t FreeMemory(void* ptr); + + /// @brief Blocking memory copy from src to dst. + /// + /// @param [in] dst Memory address of the destination. + /// @param [in] src Memory address of the source. + /// @param [in] size Copy size in bytes. + /// + /// @retval ::HSA_STATUS_SUCCESS if memory copy is successful and completed. + hsa_status_t CopyMemory(void* dst, const void* src, size_t size); + + /// @brief Non-blocking memory copy from src to dst. 
+ /// + /// @details The memory copy will be performed after all signals in + /// @p dep_signals have value of 0. On completion @p completion_signal + /// will be decremented. + /// + /// @param [in] dst Memory address of the destination. + /// @param [in] dst_agent Agent object associated with the destination. This + /// agent should be able to access the destination and source. + /// @param [in] src Memory address of the source. + /// @param [in] src_agent Agent object associated with the source. This + /// agent should be able to access the destination and source. + /// @param [in] size Copy size in bytes. + /// @param [in] dep_signals Array of signal dependency. + /// @param [in] completion_signal Completion signal object. + /// + /// @retval ::HSA_STATUS_SUCCESS if copy command has been submitted + /// successfully to the agent DMA queue. + hsa_status_t CopyMemory(void* dst, core::Agent& dst_agent, const void* src, + core::Agent& src_agent, size_t size, + std::vector& dep_signals, + core::Signal& completion_signal); + + /// @brief Fill the first @p count of uint32_t in ptr with value. + /// + /// @param [in] ptr Memory address to be filled. + /// @param [in] value The value/pattern that will be used to set @p ptr. + /// @param [in] count Number of uint32_t element to be set. + /// + /// @retval ::HSA_STATUS_SUCCESS if memory fill is successful and completed. + hsa_status_t FillMemory(void* ptr, uint32_t value, size_t count); + + /// @brief Set agents as the whitelist to access ptr. + /// + /// @param [in] num_agents The number of agent handles in @p agents array. + /// @param [in] agents Agent handle array. + /// @param [in] ptr Pointer of memory previously allocated via + /// core::Runtime::AllocateMemory. + /// + /// @retval ::HSA_STATUS_SUCCESS The whitelist has been configured + /// successfully and all agents in the @p agents could start accessing @p ptr. 
+ hsa_status_t AllowAccess(uint32_t num_agents, const hsa_agent_t* agents, + const void* ptr); + + /// @brief Query system information. + /// + /// @param [in] attribute System info attribute to query. + /// @param [out] value Pointer to store the attribute value. + /// + /// @retval ::HSA_STATUS_SUCCESS The attribute is valid and the @p value is + /// set. + hsa_status_t GetSystemInfo(hsa_system_info_t attribute, void* value); + + /// @brief Query next available queue id. + /// + /// @retval Next available queue id. + uint32_t GetQueueId(); + + /// @brief Register a callback function @p handler that is associated with + /// @p signal to asynchronous event monitor thread. + /// + /// @param [in] signal Signal handle associated with @p handler. + /// @param [in] cond The condition to execute the @p handler. + /// @param [in] value The value to compare with @p signal value. If the + /// comparison satisfy @p cond, the @p handler will be called. + /// @param [in] arg Pointer to the argument that will be provided to @p + /// handler. + /// + /// @retval ::HSA_STATUS_SUCCESS Registration is successful. 
+ hsa_status_t SetAsyncSignalHandler(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg); + /* Maps the object identified by interop_handle for the given agents; results are returned through size, ptr, metadata_size and metadata. NOTE(review): semantics inferred from the parameter names only - confirm against the implementation. */ + hsa_status_t InteropMap(uint32_t num_agents, Agent** agents, + int interop_handle, uint32_t flags, size_t* size, + void** ptr, size_t* metadata_size, + const void** metadata); + /* NOTE(review): presumably undoes InteropMap for a pointer it returned - confirm. */ + hsa_status_t InteropUnmap(void* ptr); + /* Inline accessors below return the corresponding member (reference, pointer, or address of the member) without copying. */ + const std::vector& cpu_agents() { return cpu_agents_; } + + const std::vector& gpu_agents() { return gpu_agents_; } + + + const std::vector& gpu_ids() { return gpu_ids_; } + + Agent* blit_agent() { return blit_agent_; } + + Agent* host_agent() { return host_agent_; } + + const std::vector& system_regions_fine() const { + return system_regions_fine_; + } + + const std::vector& system_regions_coarse() const { + return system_regions_coarse_; + } + + amd::hsa::loader::Loader* loader() { return loader_; } + + amd::LoaderContext* loader_context() { return &loader_context_; } + + amd::hsa::code::AmdHsaCodeManager* code_manager() { return &code_manager_; } + + std::function& system_allocator() { + return system_allocator_; + } + + std::function& system_deallocator() { + return system_deallocator_; + } + + ExtensionEntryPoints extensions_; + + protected: + static void AsyncEventsLoop(void*); /* NOTE(review): presumably the entry point run by async_events_thread_ - confirm. */ + /* Record kept per allocation: the region it came from, an assigned agent (NULL in both constructors) and its size. */ + struct AllocationRegion { + AllocationRegion() : region(NULL), assigned_agent_(NULL), size(0) {} + AllocationRegion(const MemoryRegion* region_arg, size_t size_arg) + : region(region_arg), assigned_agent_(NULL), size(size_arg) {} + + const MemoryRegion* region; + const Agent* assigned_agent_; + size_t size; + }; + /* Control state for the async events thread. The constructor only sets async_events_thread_ to NULL; wake and exit are not initialized here and must be set before use. */ + struct AsyncEventsControl { + AsyncEventsControl() : async_events_thread_(NULL) {} + void Shutdown(); + + hsa_signal_t wake; + os::Thread async_events_thread_; + KernelMutex lock; + bool exit; + }; + /* Registered async handlers stored as five vectors, one per PushBack argument (signal, cond, value, handler, arg); presumably kept index-aligned - confirm in the implementation. */ + struct AsyncEvents { + void PushBack(hsa_signal_t signal, hsa_signal_condition_t cond, + hsa_signal_value_t value, hsa_amd_signal_handler handler, + void* arg); + + void CopyIndex(size_t 
dst, size_t src); + + size_t Size(); + + void PopBack(); + + void Clear(); + + std::vector signal_; + std::vector cond_; + std::vector value_; + std::vector handler_; + std::vector arg_; + }; + + // Will be created before any user could call hsa_init but also could be + // destroyed before incorrectly written programs call hsa_shutdown. + static KernelMutex bootstrap_lock_; + + Runtime(); + // Copy constructor and copy assignment are declared here without a body. + Runtime(const Runtime&); + + Runtime& operator=(const Runtime&); + + ~Runtime() {} + + /// @brief Open connection to kernel driver. + void Load(); + + /// @brief Close connection to kernel driver and cleanup resources. + void Unload(); + + /// @brief Dynamically load extension libraries (images, finalizer) and + /// call OnLoad method on each loaded library. + void LoadExtensions(); + + /// @brief Call OnUnload method on each extension library then close it. + void UnloadExtensions(); + + /// @brief Dynamically load tool libraries and call OnLoad method on each + /// loaded library. + void LoadTools(); + + /// @brief Call OnUnload method of each tool library. + void UnloadTools(); + + /// @brief Close tool libraries. + void CloseTools(); + + /// @brief Binds virtual memory access fault handler to this node. + void BindVmFaultHandler(); + + /// @brief Blocking memory copy from src to dst. One of the src or dst + /// is a user pointer. A particular setup needs to be made if the DMA queue + /// for the memory copy belongs to a dGPU agent. E.g: pin the user pointer + /// before copying, or using a staging buffer. + /// + /// @param [in] dst Memory address of the destination. + /// @param [in] src Memory address of the source. + /// @param [in] size Copy size in bytes. + /// @param [in] dst_malloc If true, then @p dst is the user pointer. Otherwise + /// @p src is the user pointer. + /// + /// @retval ::HSA_STATUS_SUCCESS if memory copy is successful and completed. 
+ hsa_status_t CopyMemoryHostAlloc(void* dst, const void* src, size_t size, + bool dst_malloc); + + /// @brief Get the index of ::link_matrix_. + /// @param [in] node_id_from Node id of the source node. + /// @param [in] node_id_to Node id of the destination node. + /// @retval Index in ::link_matrix_. + uint32_t GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to); + + // Mutex object to protect multithreaded access to ::Acquire and ::Release. + KernelMutex kernel_lock_; + + // Mutex object to protect multithreaded access to ::allocation_map_. + KernelMutex memory_lock_; + + // Array containing tools library handles. + std::vector tool_libs_; + + // Agent list containing all CPU agents in the platform. + std::vector cpu_agents_; + + // Agent list containing all compatible GPU agents in the platform. + std::vector gpu_agents_; + + // List containing the ids of all compatible GPU agents in the platform. + std::vector gpu_ids_; + + // List of all fine grain system memory region in the platform. + std::vector system_regions_fine_; + + // List of all coarse grain system memory region in the platform. + std::vector system_regions_coarse_; + + // Matrix of IO link. + std::vector link_matrix_; + + // Loader instance. + amd::hsa::loader::Loader* loader_; + + // Loader context. + amd::LoaderContext loader_context_; + + // Code object manager. + amd::hsa::code::AmdHsaCodeManager code_manager_; + + // Contains the region, address, and size of previously allocated memory. + std::map allocation_map_; + + // Allocator using ::system_region_ + std::function system_allocator_; + + // Deallocator using ::system_region_ + std::function system_deallocator_; + + // Pointer to a host/cpu agent object. + Agent* host_agent_; + + // Pointer to DMA agent. + Agent* blit_agent_; + + AsyncEventsControl async_events_control_; + + AsyncEvents async_events_; + + AsyncEvents new_async_events_; + + // Queue id counter. + uint32_t queue_count_; + + // Starting address of SVM address space. 
+ // On APU the cpu and gpu could access the area inside starting and end of + // the SVM address space. + // On dGPU, only the gpu is guaranteed to have access to the area inside the + // SVM address space, since it may be backed by private gpu VRAM. + uintptr_t start_svm_address_; + + // End address of SVM address space. + // start_svm_address_ + size + uintptr_t end_svm_address_; + + // System clock frequency. + uint64_t sys_clock_freq_; + + // @brief AMD HSA event to monitor for virtual memory access fault. + HsaEvent* vm_fault_event_; + + // @brief HSA signal to contain the VM fault event. + Signal* vm_fault_signal_; + + // Holds reference count to runtime object. + volatile uint32_t ref_count_; + + // Frees runtime memory when the runtime library is unloaded if safe to do so. + // Failure to release the runtime indicates an incorrect application but is + // common (example: calls library routines at process exit). + friend class RuntimeCleanup; +}; + +} // namespace core +#endif // header guard diff --git a/runtime/hsa-runtime/core/inc/signal.h b/runtime/hsa-runtime/core/inc/signal.h new file mode 100644 index 0000000000..e6509421cc --- /dev/null +++ b/runtime/hsa-runtime/core/inc/signal.h @@ -0,0 +1,269 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA runtime C++ interface file. 
+ +#ifndef HSA_RUNTME_CORE_INC_SIGNAL_H_ +#define HSA_RUNTME_CORE_INC_SIGNAL_H_ + +#include "hsakmt.h" + +#include "core/common/shared.h" + +#include "core/inc/runtime.h" +#include "core/inc/checked.h" + +#include "core/util/utils.h" + +#include "inc/amd_hsa_signal.h" + +namespace core { +class Signal; + +/// @brief Helper structure to simplify conversion of amd_signal_t and +/// core::Signal object. +struct SharedSignal { + amd_signal_t amd_signal; + Signal* core_signal; +}; + +/// @brief An abstract base class which helps implement the public hsa_signal_t +/// type (an opaque handle) and its associated APIs. At its core, signal uses +/// a 32 or 64 bit value. This value can be waited on or signaled atomically +/// using specified memory ordering semantics. +class Signal : public Checked<0x71FCCA6A3D5D5276>, + public Shared { + public: + /// @brief Constructor initializes the signal with initial value. + explicit Signal(hsa_signal_value_t initial_value) + : Shared(), signal_(shared_object()->amd_signal) { + if (!Shared::IsSharedObjectAllocationValid()) { + invalid_ = true; + return; + } + + shared_object()->core_signal = this; + + signal_.kind = AMD_SIGNAL_KIND_INVALID; + signal_.value = initial_value; + invalid_ = false; + waiting_ = 0; + retained_ = 0; + } + + virtual ~Signal() { signal_.kind = AMD_SIGNAL_KIND_INVALID; } + + bool IsValid() const { + if (CheckedType::IsValid() && !invalid_) return true; + return false; + } + + /// @brief Converts from this implementation class to the public + /// hsa_signal_t type - an opaque handle. + static __forceinline hsa_signal_t Convert(Signal* signal) { + const uint64_t handle = + (signal != NULL && signal->IsValid()) + ? static_cast( + reinterpret_cast(&signal->signal_)) + : 0; + const hsa_signal_t signal_handle = {handle}; + return signal_handle; + } + + /// @brief Converts from this implementation class to the public + /// hsa_signal_t type - an opaque handle. 
+ static __forceinline const hsa_signal_t Convert(const Signal* signal) { + const uint64_t handle = + (signal != NULL && signal->IsValid()) + ? static_cast( + reinterpret_cast(&signal->signal_)) + : 0; + const hsa_signal_t signal_handle = {handle}; + return signal_handle; + } + + /// @brief Converts from public hsa_signal_t type (an opaque handle) to + /// this implementation class object. The handle is the address of the amd_signal member; subtracting that member's offset (measured from a dummy base address of 1234) gives the enclosing object, whose core_signal pointer is returned. A zero handle yields NULL. + static __forceinline Signal* Convert(hsa_signal_t signal) { + return (signal.handle != 0) + ? reinterpret_cast( + static_cast(signal.handle) - + (reinterpret_cast( + &reinterpret_cast(1234)->amd_signal) - + uintptr_t(1234)))->core_signal + : NULL; + } + + // Below are various methods corresponding to the APIs, which load/store the + // signal value or modify the existing signal value atomically and with + // specified memory ordering semantics. + virtual hsa_signal_value_t LoadRelaxed() = 0; + virtual hsa_signal_value_t LoadAcquire() = 0; + + virtual void StoreRelaxed(hsa_signal_value_t value) = 0; + virtual void StoreRelease(hsa_signal_value_t value) = 0; + + virtual hsa_signal_value_t WaitRelaxed(hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout, + hsa_wait_state_t wait_hint) = 0; + virtual hsa_signal_value_t WaitAcquire(hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout, + hsa_wait_state_t wait_hint) = 0; + + virtual void AndRelaxed(hsa_signal_value_t value) = 0; + virtual void AndAcquire(hsa_signal_value_t value) = 0; + virtual void AndRelease(hsa_signal_value_t value) = 0; + virtual void AndAcqRel(hsa_signal_value_t value) = 0; + + virtual void OrRelaxed(hsa_signal_value_t value) = 0; + virtual void OrAcquire(hsa_signal_value_t value) = 0; + virtual void OrRelease(hsa_signal_value_t value) = 0; + virtual void OrAcqRel(hsa_signal_value_t value) = 0; + + virtual void XorRelaxed(hsa_signal_value_t value) = 0; + virtual void XorAcquire(hsa_signal_value_t value) = 0; + virtual void 
XorRelease(hsa_signal_value_t value) = 0; + virtual void XorAcqRel(hsa_signal_value_t value) = 0; + + virtual void AddRelaxed(hsa_signal_value_t value) = 0; + virtual void AddAcquire(hsa_signal_value_t value) = 0; + virtual void AddRelease(hsa_signal_value_t value) = 0; + virtual void AddAcqRel(hsa_signal_value_t value) = 0; + + virtual void SubRelaxed(hsa_signal_value_t value) = 0; + virtual void SubAcquire(hsa_signal_value_t value) = 0; + virtual void SubRelease(hsa_signal_value_t value) = 0; + virtual void SubAcqRel(hsa_signal_value_t value) = 0; + + virtual hsa_signal_value_t ExchRelaxed(hsa_signal_value_t value) = 0; + virtual hsa_signal_value_t ExchAcquire(hsa_signal_value_t value) = 0; + virtual hsa_signal_value_t ExchRelease(hsa_signal_value_t value) = 0; + virtual hsa_signal_value_t ExchAcqRel(hsa_signal_value_t value) = 0; + + virtual hsa_signal_value_t CasRelaxed(hsa_signal_value_t expected, + hsa_signal_value_t value) = 0; + virtual hsa_signal_value_t CasAcquire(hsa_signal_value_t expected, + hsa_signal_value_t value) = 0; + virtual hsa_signal_value_t CasRelease(hsa_signal_value_t expected, + hsa_signal_value_t value) = 0; + virtual hsa_signal_value_t CasAcqRel(hsa_signal_value_t expected, + hsa_signal_value_t value) = 0; + + //------------------------- + // implementation specific + //------------------------- + typedef void* rtti_t; + + /// @brief Returns the address of the value. + virtual hsa_signal_value_t* ValueLocation() const = 0; + + /// @brief Returns the HsaEvent used by this signal. Applies only to the + /// interrupt signal type; returns NULL for the default signal type. + virtual HsaEvent* EopEvent() = 0; + + /// @brief Waits until any signal in the list satisfies its condition or + /// timeout is reached. + /// Returns the index of a satisfied signal. Returns -1 (that is, uint32_t(-1), + /// because the return type is unsigned) on timeout and errors. 
+ static uint32_t WaitAny(uint32_t signal_count, hsa_signal_t* hsa_signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout_hint, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value); + + __forceinline bool IsType(rtti_t id) { return _IsA(id); } + + /// @brief Allows special case interaction with signal destruction cleanup. + void Retain() { atomic::Increment(&retained_); } + void Release() { atomic::Decrement(&retained_); } + + /// @brief Checks if signal is currently in use such that it should not be + /// deleted. + bool InUse() const { return (retained_ != 0) || (waiting_ != 0); } + + /// @brief Checks if signal is currently in use by a wait API. + bool InWaiting() const { return waiting_ != 0; } + + /// @brief Structure which defines key signal elements like type and value. + /// Address of this struct is used as a value for the opaque handle of type + /// hsa_signal_t provided to the public API. + amd_signal_t& signal_; + + protected: + /// @brief Simple RTTI type checking helper + /// Returns true if the object can be converted to the query type via + /// static_cast. + /// Do not use directly. Use IsType in the desired derived type instead. + virtual bool _IsA(rtti_t id) const = 0; + + /// @variable Indicates if signal is valid or not. + volatile bool invalid_; + + /// @variable Indicates number of runtime threads waiting on this signal. + /// Value of zero means no waits. 
+ volatile uint32_t waiting_; + /* Incremented by Retain() and decremented by Release(); a non-zero value makes InUse() return true. */ + volatile uint32_t retained_; + + private: + DISALLOW_COPY_AND_ASSIGN(Signal); +}; +/* Wrapper holding a single hsa_signal_t: converts implicitly to and from the handle and forwards operator-> to the object returned by core::Signal::Convert. The static_asserts below check that it has the same size as hsa_signal_t, both singly and as an array element. */ +struct hsa_signal_handle { + hsa_signal_t signal; + + hsa_signal_handle() {} + hsa_signal_handle(hsa_signal_t Signal) { signal = Signal; } + operator hsa_signal_t() { return signal; } + Signal* operator->() { return core::Signal::Convert(signal); } +}; +static_assert( + sizeof(hsa_signal_handle) == sizeof(hsa_signal_t), + "hsa_signal_handle and hsa_signal_t must have identical binary layout."); +static_assert( + sizeof(hsa_signal_handle[2]) == sizeof(hsa_signal_t[2]), + "hsa_signal_handle and hsa_signal_t must have identical binary layout."); + +} // namespace core +#endif // header guard diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp new file mode 100644 index 0000000000..870410a69c --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -0,0 +1,856 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/amd_aql_queue.h" + +#ifdef __linux__ +#include +#include +#include +#include +#include +#endif + +#ifdef _WIN32 +#include +#endif + +#include +#include + +#include "core/inc/runtime.h" +#include "core/inc/amd_memory_region.h" +#include "core/inc/signal.h" +#include "core/inc/queue.h" +#include "core/util/utils.h" +#include "core/inc/registers.h" +#include "core/inc/interrupt_signal.h" + +namespace amd { +// Queue::amd_queue_ is cache-aligned for performance. +const uint32_t kAmdQueueAlignBytes = 0x40; + +HsaEvent* AqlQueue::queue_event_ = NULL; +volatile uint32_t AqlQueue::queue_count_ = 0; +KernelMutex AqlQueue::queue_lock_; +int AqlQueue::rtti_id_; + +void* AqlQueue::operator new(size_t size) { + // Align base to 64B to enforce amd_queue_ member alignment. 
+ return _aligned_malloc(size, kAmdQueueAlignBytes); +} + +void AqlQueue::operator delete(void* ptr) { _aligned_free(ptr); } + +AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, + ScratchInfo& scratch, core::HsaEventCallback callback, + void* err_data, bool is_kv) + : Queue(), + Signal(0), + ring_buf_(NULL), + ring_buf_alloc_bytes_(0), + queue_id_(HSA_QUEUEID(-1)), + valid_(false), + agent_(agent), + queue_scratch_(scratch), + errors_callback_(callback), + errors_data_(err_data), + is_kv_queue_(is_kv) { + if (!Queue::Shared::IsSharedObjectAllocationValid()) { + return; + } + + hsa_status_t stat = agent_->GetInfo(HSA_AGENT_INFO_PROFILE, &agent_profile_); + assert(stat == HSA_STATUS_SUCCESS); + + const core::Isa* isa = agent_->isa(); + + // When queue_full_workaround_ is set to 1, the ring buffer is internally + // doubled in size. Virtual addresses in the upper half of the ring allocation + // are mapped to the same set of pages backing the lower half. + // Values written to the HW doorbell are modulo the doubled size. + // This allows the HW to accept (doorbell == last_doorbell + queue_size). + // This workaround is required for GFXIP 7 and GFXIP 8 ASICs. + queue_full_workaround_ = + (isa->GetMajorVersion() == 7 || isa->GetMajorVersion() == 8) + ? 1 + : 0; + + // Identify doorbell semantics for this agent. + doorbell_type_ = agent->properties().Capability.ui32.DoorbellType; + + // Queue size is a function of several restrictions. + const uint32_t min_pkts = ComputeRingBufferMinPkts(); + const uint32_t max_pkts = ComputeRingBufferMaxPkts(); + + // Apply sizing constraints to the ring buffer. + uint32_t queue_size_pkts = uint32_t(req_size_pkts); + queue_size_pkts = Min(queue_size_pkts, max_pkts); + queue_size_pkts = Max(queue_size_pkts, min_pkts); + + uint32_t queue_size_bytes = queue_size_pkts * sizeof(core::AqlPacket); + if ((queue_size_bytes & (queue_size_bytes - 1)) != 0) return; + + // Allocate the AQL packet ring buffer. 
+ AllocRegisteredRingBuffer(queue_size_pkts); + if (ring_buf_ == NULL) return; + MAKE_NAMED_SCOPE_GUARD(RingGuard, [&]() { FreeRegisteredRingBuffer(); }); + + // Fill the ring buffer with ALWAYS_RESERVED packet headers. + // Leave packet content uninitialized to help track errors. + for (uint32_t pkt_id = 0; pkt_id < queue_size_pkts; ++pkt_id) { + ((uint32_t*)ring_buf_)[16 * pkt_id] = HSA_PACKET_TYPE_INVALID; + } + + // Zero the amd_queue_ structure to clear RPTR/WPTR before queue attach. + memset(&amd_queue_, 0, sizeof(amd_queue_)); + + // Initialize and map a HW AQL queue. + HsaQueueResource queue_rsrc = {0}; + queue_rsrc.Queue_read_ptr_aql = (uint64_t*)&amd_queue_.read_dispatch_id; + queue_rsrc.Queue_write_ptr_aql = + (uint64_t*)&amd_queue_.max_legacy_doorbell_dispatch_id_plus_1; + + HSAKMT_STATUS kmt_status; + kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, + HSA_QUEUE_PRIORITY_NORMAL, ring_buf_, + ring_buf_alloc_bytes_, NULL, &queue_rsrc); + if (kmt_status != HSAKMT_STATUS_SUCCESS) return; + queue_id_ = queue_rsrc.QueueId; + MAKE_NAMED_SCOPE_GUARD(QueueGuard, [&]() { hsaKmtDestroyQueue(queue_id_); }); + + // Populate doorbell signal structure. + memset(&signal_, 0, sizeof(signal_)); + signal_.kind = AMD_SIGNAL_KIND_LEGACY_DOORBELL; + signal_.legacy_hardware_doorbell_ptr = + (volatile uint32_t*)queue_rsrc.Queue_DoorBell; + signal_.queue_ptr = &amd_queue_; + + // Populate amd_queue_ structure. 
+ amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_MULTI; + amd_queue_.hsa_queue.features = HSA_QUEUE_FEATURE_KERNEL_DISPATCH; + amd_queue_.hsa_queue.base_address = ring_buf_; + amd_queue_.hsa_queue.doorbell_signal = Signal::Convert(this); + amd_queue_.hsa_queue.size = queue_size_pkts; + amd_queue_.hsa_queue.id = core::Runtime::runtime_singleton_->GetQueueId(); + amd_queue_.read_dispatch_id_field_base_byte_offset = uint32_t( + uintptr_t(&amd_queue_.read_dispatch_id) - uintptr_t(&amd_queue_)); + + const auto& props = agent->properties(); + amd_queue_.max_cu_id = (props.NumFComputeCores / props.NumSIMDPerCU) - 1; + amd_queue_.max_wave_id = props.MaxWavesPerSIMD - 1; + +#ifdef HSA_LARGE_MODEL + AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, + 1); +#else + AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, + 0); +#endif + + // Populate scratch resource descriptor in amd_queue_. + SQ_BUF_RSRC_WORD0 srd0; + SQ_BUF_RSRC_WORD1 srd1; + SQ_BUF_RSRC_WORD2 srd2; + SQ_BUF_RSRC_WORD3 srd3; + uintptr_t scratch_base = uintptr_t(queue_scratch_.queue_base); + uint32_t scratch_base_hi = 0; + +#ifdef HSA_LARGE_MODEL + scratch_base_hi = uint32_t(scratch_base >> 32); +#endif + + srd0.bits.BASE_ADDRESS = uint32_t(scratch_base); + srd1.bits.BASE_ADDRESS_HI = scratch_base_hi; + srd1.bits.STRIDE = 0; + srd1.bits.CACHE_SWIZZLE = 0; + srd1.bits.SWIZZLE_ENABLE = 1; + srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size); + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT; + srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32; + srd3.bits.ELEMENT_SIZE = 1; // 4 + srd3.bits.INDEX_STRIDE = 3; // 64 + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.ATC__CI__VI = (agent_profile_ == HSA_PROFILE_FULL) ? 
1 : 0; + srd3.bits.HASH_ENABLE = 0; + srd3.bits.HEAP = 0; + srd3.bits.MTYPE__CI__VI = 0; + srd3.bits.TYPE = SQ_RSRC_BUF; + + amd_queue_.scratch_resource_descriptor[0] = srd0.u32All; + amd_queue_.scratch_resource_descriptor[1] = srd1.u32All; + amd_queue_.scratch_resource_descriptor[2] = srd2.u32All; + amd_queue_.scratch_resource_descriptor[3] = srd3.u32All; + + // Populate flat scratch parameters in amd_queue_. + amd_queue_.scratch_backing_memory_location = + queue_scratch_.queue_process_offset; + amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size; + amd_queue_.scratch_workitem_byte_size = + uint32_t(queue_scratch_.size_per_thread); + + // Set concurrent wavefront limits when scratch is being used. + COMPUTE_TMPRING_SIZE tmpring_size = {0}; + + if (queue_scratch_.size != 0) { + tmpring_size.bits.WAVES = + (queue_scratch_.size / queue_scratch_.size_per_thread / 64); + tmpring_size.bits.WAVESIZE = + (((64 * queue_scratch_.size_per_thread) + 1023) / 1024); + } + + amd_queue_.compute_tmpring_size = tmpring_size.u32All; + + // Set group and private memory apertures in amd_queue_. 
+ auto& regions = agent->regions(); + + for (int i = 0; i < regions.size(); i++) { + const MemoryRegion* amdregion; + amdregion = static_cast(regions[i]); + uint64_t base = amdregion->GetBaseAddress(); + + if (amdregion->IsLDS()) { +#ifdef HSA_LARGE_MODEL + amd_queue_.group_segment_aperture_base_hi = + uint32_t(uintptr_t(base) >> 32); +#else + amd_queue_.group_segment_aperture_base_hi = uint32_t(base); +#endif + } + + if (amdregion->IsScratch()) { +#ifdef HSA_LARGE_MODEL + amd_queue_.private_segment_aperture_base_hi = + uint32_t(uintptr_t(base) >> 32); +#else + amd_queue_.private_segment_aperture_base_hi = uint32_t(base); +#endif + } + } + + assert(amd_queue_.group_segment_aperture_base_hi != NULL && + "No group region found."); + + if (os::GetEnvVar("HSA_CHECK_FLAT_SCRATCH") == "1") { + assert(amd_queue_.private_segment_aperture_base_hi != NULL && + "No private region found."); + } + + MAKE_NAMED_SCOPE_GUARD(EventGuard, [&]() { + ScopedAcquire _lock(&queue_lock_); + queue_count_--; + if (queue_count_ == 0) { + core::InterruptSignal::DestroyEvent(queue_event_); + queue_event_ = NULL; + } + }); + + MAKE_NAMED_SCOPE_GUARD(SignalGuard, [&]() { + HSA::hsa_signal_destroy(amd_queue_.queue_inactive_signal); + }); +#if defined(HSA_LARGE_MODEL) && defined(__linux__) + if (core::g_use_interrupt_wait) { + { + ScopedAcquire _lock(&queue_lock_); + queue_count_++; + if (queue_event_ == NULL) { + assert(queue_count_ == 1 && + "Inconsistency in queue event reference counting found.\n"); + + queue_event_ = + core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_SIGNAL, false); + if (queue_event_ == NULL) return; + } + } + auto signal = new core::InterruptSignal(0, queue_event_); + amd_queue_.queue_inactive_signal = core::InterruptSignal::Convert(signal); + if (hsa_amd_signal_async_handler( + amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE, 0, + DynamicScratchHandler, this) != HSA_STATUS_SUCCESS) + return; + } else { + EventGuard.Dismiss(); + SignalGuard.Dismiss(); + } +#else + 
EventGuard.Dismiss(); + SignalGuard.Dismiss(); +#endif + + valid_ = true; + active_ = 1; + + RingGuard.Dismiss(); + QueueGuard.Dismiss(); + EventGuard.Dismiss(); + SignalGuard.Dismiss(); +} + +AqlQueue::~AqlQueue() { + if (!IsValid()) { + return; + } + + if (active_ == 1) hsaKmtDestroyQueue(queue_id_); + + FreeRegisteredRingBuffer(); + agent_->ReleaseQueueScratch(queue_scratch_.queue_base); + HSA::hsa_signal_destroy(amd_queue_.queue_inactive_signal); +#if defined(HSA_LARGE_MODEL) && defined(__linux__) + if (core::g_use_interrupt_wait) { + ScopedAcquire lock(&queue_lock_); + queue_count_--; + if (queue_count_ == 0) { + core::InterruptSignal::DestroyEvent(queue_event_); + queue_event_ = NULL; + } + } +#endif +} + +uint64_t AqlQueue::LoadReadIndexAcquire() { + return atomic::Load(&amd_queue_.read_dispatch_id, std::memory_order_acquire); +} + +uint64_t AqlQueue::LoadReadIndexRelaxed() { + return atomic::Load(&amd_queue_.read_dispatch_id, std::memory_order_relaxed); +} + +uint64_t AqlQueue::LoadWriteIndexAcquire() { + return atomic::Load(&amd_queue_.write_dispatch_id, std::memory_order_acquire); +} + +uint64_t AqlQueue::LoadWriteIndexRelaxed() { + return atomic::Load(&amd_queue_.write_dispatch_id, std::memory_order_relaxed); +} + +void AqlQueue::StoreWriteIndexRelaxed(uint64_t value) { + atomic::Store(&amd_queue_.write_dispatch_id, value, + std::memory_order_relaxed); +} + +void AqlQueue::StoreWriteIndexRelease(uint64_t value) { + atomic::Store(&amd_queue_.write_dispatch_id, value, + std::memory_order_release); +} + +uint64_t AqlQueue::CasWriteIndexAcqRel(uint64_t expected, uint64_t value) { + return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, + std::memory_order_acq_rel); +} +uint64_t AqlQueue::CasWriteIndexAcquire(uint64_t expected, uint64_t value) { + return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, + std::memory_order_acquire); +} +uint64_t AqlQueue::CasWriteIndexRelaxed(uint64_t expected, uint64_t value) { + return 
atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, + std::memory_order_relaxed); +} +uint64_t AqlQueue::CasWriteIndexRelease(uint64_t expected, uint64_t value) { + return atomic::Cas(&amd_queue_.write_dispatch_id, value, expected, + std::memory_order_release); +} + +uint64_t AqlQueue::AddWriteIndexAcqRel(uint64_t value) { + return atomic::Add(&amd_queue_.write_dispatch_id, value, + std::memory_order_acq_rel); +} + +uint64_t AqlQueue::AddWriteIndexAcquire(uint64_t value) { + return atomic::Add(&amd_queue_.write_dispatch_id, value, + std::memory_order_acquire); +} + +uint64_t AqlQueue::AddWriteIndexRelaxed(uint64_t value) { + return atomic::Add(&amd_queue_.write_dispatch_id, value, + std::memory_order_relaxed); +} + +uint64_t AqlQueue::AddWriteIndexRelease(uint64_t value) { + return atomic::Add(&amd_queue_.write_dispatch_id, value, + std::memory_order_release); +} + +void AqlQueue::StoreRelaxed(hsa_signal_value_t value) { + // Acquire spinlock protecting the legacy doorbell. + while (atomic::Cas(&amd_queue_.legacy_doorbell_lock, 1U, 0U, + std::memory_order_acquire) != 0) { + os::YieldThread(); + } + +#ifdef HSA_LARGE_MODEL + // AMD hardware convention expects the packet index to point beyond + // the last packet to be processed. Packet indices written to the + // max_legacy_doorbell_dispatch_id_plus_1 field must conform to this + // expectation, since this field is used as the HW-visible write index. + uint64_t legacy_dispatch_id = value + 1; +#else + // In the small machine model it is difficult to distinguish packet index + // wrap at 2^32 packets from a backwards doorbell. Instead, ignore the + // doorbell value and submit the write index instead. It is OK to issue + // a doorbell for packets in the INVALID or ALWAYS_RESERVED state. + // The HW will stall on these packets until they enter a valid state. 
+ uint64_t legacy_dispatch_id = amd_queue_.write_dispatch_id; + + // The write index may extend more than a full queue of packets beyond + // the read index. The hardware can process at most a full queue of packets + // at a time. Clamp the write index appropriately. A doorbell for the + // remaining packets is guaranteed to be sent at a later time. + legacy_dispatch_id = + Min(legacy_dispatch_id, + uint64_t(amd_queue_.read_dispatch_id) + amd_queue_.hsa_queue.size); +#endif + + // Discard backwards and duplicate doorbells. + if (legacy_dispatch_id > amd_queue_.max_legacy_doorbell_dispatch_id_plus_1) { + // Record the most recent packet index used in a doorbell submission. + // This field will be interpreted as a write index upon HW queue connect. + // Must be visible to the HW before sending the doorbell to avoid a race. + atomic::Store(&amd_queue_.max_legacy_doorbell_dispatch_id_plus_1, + legacy_dispatch_id, std::memory_order_relaxed); + + // Write the dispatch id to the hardware MMIO doorbell. + if (doorbell_type_ == 0) { + // The legacy GFXIP 7 hardware doorbell expects: + // 1. Packet index wrapped to a point within the ring buffer + // 2. Packet index converted to DWORD count + uint64_t queue_size_mask = + ((1 + queue_full_workaround_) * amd_queue_.hsa_queue.size) - 1; + + *(volatile uint32_t*)signal_.legacy_hardware_doorbell_ptr = + uint32_t((legacy_dispatch_id & queue_size_mask) * + (sizeof(core::AqlPacket) / sizeof(uint32_t))); + } else if (doorbell_type_ == 1) { + *(volatile uint32_t*)signal_.legacy_hardware_doorbell_ptr = + uint32_t(legacy_dispatch_id); + } else { + assert(false && "Agent has unsupported doorbell semantics"); + } + } + + // Release spinlock protecting the legacy doorbell. 
+ atomic::Store(&amd_queue_.legacy_doorbell_lock, 0U, + std::memory_order_release); +} + +void AqlQueue::StoreRelease(hsa_signal_value_t value) { + std::atomic_thread_fence(std::memory_order_release); + StoreRelaxed(value); +} + +uint32_t AqlQueue::ComputeRingBufferMinPkts() { + // From CP_HQD_PQ_CONTROL.QUEUE_SIZE specification: + // Size of the primary queue (PQ) will be: 2^(HQD_QUEUE_SIZE+1) DWs. + // Min Size is 7 (2^8 = 256 DWs) and max size is 29 (2^30 = 1 G-DW) + uint32_t min_bytes = 0x400; + + if (queue_full_workaround_ == 1) { +#ifdef __linux__ + // Double mapping requires one page of backing store. + min_bytes = Max(min_bytes, 0x1000U); +#endif +#ifdef _WIN32 + // Shared memory mapping is at system allocation granularity. + SYSTEM_INFO sys_info; + GetNativeSystemInfo(&sys_info); + min_bytes = Max(min_bytes, uint32_t(sys_info.dwAllocationGranularity)); +#endif + } + + return uint32_t(min_bytes / sizeof(core::AqlPacket)); +} + +uint32_t AqlQueue::ComputeRingBufferMaxPkts() { + // From CP_HQD_PQ_CONTROL.QUEUE_SIZE specification: + // Size of the primary queue (PQ) will be: 2^(HQD_QUEUE_SIZE+1) DWs. + // Min Size is 7 (2^8 = 256 DWs) and max size is 29 (2^30 = 1 G-DW) + uint64_t max_bytes = 0x100000000; + + if (queue_full_workaround_ == 1) { + // Double mapping halves maximum size. + max_bytes /= 2; + } + + return uint32_t(max_bytes / sizeof(core::AqlPacket)); +} + +void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) { + if (agent_profile_ == HSA_PROFILE_FULL) { + // Compute the physical and virtual size of the queue. + uint32_t ring_buf_phys_size_bytes = + uint32_t(queue_size_pkts * sizeof(core::AqlPacket)); + ring_buf_alloc_bytes_ = 2 * ring_buf_phys_size_bytes; + +#ifdef __linux__ + // Create a system-unique shared memory path for this thread. 
+ char ring_buf_shm_path[16]; + pid_t sys_unique_tid = pid_t(syscall(__NR_gettid)); + sprintf(ring_buf_shm_path, "/%u", sys_unique_tid); + + int ring_buf_shm_fd = -1; + void* reserve_va = NULL; + + do { + // Create a shared memory object to back the ring buffer. + ring_buf_shm_fd = shm_open(ring_buf_shm_path, O_CREAT | O_RDWR | O_EXCL, + S_IRUSR | S_IWUSR); + if (ring_buf_shm_fd == -1) { + break; + } + if (posix_fallocate(ring_buf_shm_fd, 0, ring_buf_phys_size_bytes) != 0) + break; + + // Reserve a VA range twice the size of the physical backing store. + reserve_va = mmap(NULL, ring_buf_alloc_bytes_, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + assert(reserve_va != MAP_FAILED && "mmap failed"); + + // Remap the lower and upper halves of the VA range. + // Map both halves to the shared memory backing store. + // If the GPU device is KV, do not set PROT_EXEC flag. + void* ring_buf_lower_half = NULL; + void* ring_buf_upper_half = NULL; + if (is_kv_queue_) { + ring_buf_lower_half = + mmap(reserve_va, ring_buf_phys_size_bytes, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, ring_buf_shm_fd, 0); + assert(ring_buf_lower_half != MAP_FAILED && "mmap failed"); + + ring_buf_upper_half = + mmap((void*)(uintptr_t(reserve_va) + ring_buf_phys_size_bytes), + ring_buf_phys_size_bytes, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, ring_buf_shm_fd, 0); + assert(ring_buf_upper_half != MAP_FAILED && "mmap failed"); + } else { + ring_buf_lower_half = mmap(reserve_va, ring_buf_phys_size_bytes, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED | MAP_FIXED, ring_buf_shm_fd, 0); + assert(ring_buf_lower_half != MAP_FAILED && "mmap failed"); + + ring_buf_upper_half = + mmap((void*)(uintptr_t(reserve_va) + ring_buf_phys_size_bytes), + ring_buf_phys_size_bytes, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED | MAP_FIXED, ring_buf_shm_fd, 0); + assert(ring_buf_upper_half != MAP_FAILED && "mmap failed"); + } + + // Release explicit reference to shared memory object. 
+ shm_unlink(ring_buf_shm_path); + close(ring_buf_shm_fd); + + // Successfully created mapping. + ring_buf_ = ring_buf_lower_half; + return; + } while (false); + + // Resource cleanup on failure. + if (reserve_va) munmap(reserve_va, ring_buf_alloc_bytes_); + if (ring_buf_shm_fd != -1) { + shm_unlink(ring_buf_shm_path); + close(ring_buf_shm_fd); + } +#endif +#ifdef _WIN32 + HANDLE ring_buf_mapping = INVALID_HANDLE_VALUE; + void* ring_buf_lower_half = NULL; + void* ring_buf_upper_half = NULL; + + do { + // Create a page file mapping to back the ring buffer. + ring_buf_mapping = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, + PAGE_EXECUTE_READWRITE | SEC_COMMIT, + 0, ring_buf_phys_size_bytes, NULL); + if (ring_buf_mapping == NULL) { + break; + } + + // Retry until obtaining an appropriate virtual address mapping. + for (int num_attempts = 0; num_attempts < 1000; ++num_attempts) { + // Find a virtual address range twice the size of the file mapping. + void* reserve_va = + VirtualAllocEx(GetCurrentProcess(), NULL, ring_buf_alloc_bytes_, + MEM_TOP_DOWN | MEM_RESERVE, PAGE_EXECUTE_READWRITE); + if (reserve_va == NULL) { + break; + } + VirtualFree(reserve_va, 0, MEM_RELEASE); + + // Map the ring buffer into the free virtual range. + // This may fail: another thread can allocate in this range. + ring_buf_lower_half = MapViewOfFileEx( + ring_buf_mapping, FILE_MAP_ALL_ACCESS | FILE_MAP_EXECUTE, 0, 0, + ring_buf_phys_size_bytes, reserve_va); + + if (ring_buf_lower_half == NULL) { + // Virtual range allocated by another thread, try again. + continue; + } + + ring_buf_upper_half = MapViewOfFileEx( + ring_buf_mapping, FILE_MAP_ALL_ACCESS | FILE_MAP_EXECUTE, 0, 0, + ring_buf_phys_size_bytes, + (void*)(uintptr_t(reserve_va) + ring_buf_phys_size_bytes)); + + if (ring_buf_upper_half == NULL) { + // Virtual range allocated by another thread, try again. + UnmapViewOfFile(ring_buf_lower_half); + continue; + } + + // Successfully created mapping. 
+ ring_buf_ = ring_buf_lower_half; + break; + } + + if (ring_buf_ == NULL) { + break; + } + + // Release file mapping (reference counted by views). + CloseHandle(ring_buf_mapping); + + // Don't register the memory: causes a failure in the KFD. + // Instead use implicit registration to access the ring buffer. + return; + } while (false); + + // Resource cleanup on failure. + UnmapViewOfFile(ring_buf_upper_half); + UnmapViewOfFile(ring_buf_lower_half); + CloseHandle(ring_buf_mapping); +#endif + } else { + // Allocate storage for the ring buffer. + HsaMemFlags flags; + flags.Value = 0; + flags.ui32.HostAccess = 1; + flags.ui32.AtomicAccessPartial = 1; + flags.ui32.ExecuteAccess = 1; + flags.ui32.AQLQueueMemory = 1; + + ring_buf_alloc_bytes_ = AlignUp( + queue_size_pkts * static_cast(sizeof(core::AqlPacket)), 4096); + auto err = hsaKmtAllocMemory(agent_->node_id(), ring_buf_alloc_bytes_, + flags, (void**)&ring_buf_); + + if (err != HSAKMT_STATUS_SUCCESS) { + assert(false && "AQL queue memory allocation failure."); + return; + } + + HSAuint64 alternate_va; + err = hsaKmtMapMemoryToGPU(ring_buf_, ring_buf_alloc_bytes_, &alternate_va); + + if (err != HSAKMT_STATUS_SUCCESS) { + assert(false && "AQL queue memory map failure."); + hsaKmtFreeMemory(ring_buf_, ring_buf_alloc_bytes_); + ring_buf_ = NULL; + return; + } + + ring_buf_alloc_bytes_ = 2 * ring_buf_alloc_bytes_; + } +} + +void AqlQueue::FreeRegisteredRingBuffer() { + if (agent_profile_ == HSA_PROFILE_FULL) { +#ifdef __linux__ + munmap(ring_buf_, ring_buf_alloc_bytes_); +#endif +#ifdef _WIN32 + UnmapViewOfFile(ring_buf_); + UnmapViewOfFile( + (void*)(uintptr_t(ring_buf_) + (ring_buf_alloc_bytes_ / 2))); +#endif + } else { + hsaKmtUnmapMemoryToGPU(ring_buf_); + hsaKmtFreeMemory(ring_buf_, ring_buf_alloc_bytes_ / 2); + } + + ring_buf_ = NULL; + ring_buf_alloc_bytes_ = 0; +} + +hsa_status_t AqlQueue::Inactivate() { + int32_t active = atomic::Exchange((volatile int32_t*)&active_, 0); + if (active == 1) 
hsaKmtDestroyQueue(this->queue_id_); + return HSA_STATUS_SUCCESS; +} + +bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) { + AqlQueue* queue = (AqlQueue*)arg; + + if ((error_code & 1) == 1) { + // Insufficient scratch - recoverable + auto& scratch = queue->queue_scratch_; + + queue->agent_->ReleaseQueueScratch(scratch.queue_base); + + const core::AqlPacket& pkt = + ((core::AqlPacket*)queue->amd_queue_.hsa_queue + .base_address)[queue->amd_queue_.read_dispatch_id]; + + uint32_t scratch_request = pkt.dispatch.private_segment_size; + + scratch.size_per_thread = + Max(uint32_t(scratch.size_per_thread * 2), scratch_request); + // Align whole waves to 1KB. + scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16); + scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) * + 32 * 64; // TODO: replace constants. + + // printf("Growing scratch to %u - %u\n", uint32_t(scratch.size_per_thread), + // uint32_t(scratch.size)); + + queue->agent_->AcquireQueueScratch(scratch); + if (scratch.queue_base == NULL) { + // Out of scratch - promote error and invalidate queue + queue->Inactivate(); + if (queue->errors_callback_ != NULL) + queue->errors_callback_(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + queue->public_handle(), queue->errors_data_); + return false; + } + + SQ_BUF_RSRC_WORD0 srd0; + SQ_BUF_RSRC_WORD2 srd2; + uintptr_t base = (uintptr_t)scratch.queue_base; + + srd0.u32All = queue->amd_queue_.scratch_resource_descriptor[0]; + srd2.u32All = queue->amd_queue_.scratch_resource_descriptor[2]; + + srd0.bits.BASE_ADDRESS = uint32_t(base); + srd2.bits.NUM_RECORDS = uint32_t(scratch.size); + + queue->amd_queue_.scratch_resource_descriptor[0] = srd0.u32All; + queue->amd_queue_.scratch_resource_descriptor[2] = srd2.u32All; + +#ifdef HSA_LARGE_MODEL + SQ_BUF_RSRC_WORD1 srd1; + srd1.u32All = queue->amd_queue_.scratch_resource_descriptor[1]; + srd1.bits.BASE_ADDRESS_HI = uint32_t(base >> 32); + 
queue->amd_queue_.scratch_resource_descriptor[1] = srd1.u32All; +#endif + + queue->amd_queue_.scratch_backing_memory_location = + scratch.queue_process_offset; + queue->amd_queue_.scratch_backing_memory_byte_size = scratch.size; + queue->amd_queue_.scratch_workitem_byte_size = + uint32_t(scratch.size_per_thread); + + COMPUTE_TMPRING_SIZE tmpring_size = {0}; + tmpring_size.bits.WAVES = (scratch.size / scratch.size_per_thread / 64); + tmpring_size.bits.WAVESIZE = + (((64 * scratch.size_per_thread) + 1023) / 1024); + queue->amd_queue_.compute_tmpring_size = tmpring_size.u32All; + + } else if ((error_code & 2) == 2) { // Invalid dim + queue->Inactivate(); + if (queue->errors_callback_ != NULL) + queue->errors_callback_(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS, + queue->public_handle(), queue->errors_data_); + return false; + + } else if ((error_code & 4) == 4) { // Invalid group memory + queue->Inactivate(); + if (queue->errors_callback_ != NULL) + queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ALLOCATION, + queue->public_handle(), queue->errors_data_); + return false; + + } else if ((error_code & 8) == 8) { // Invalid (or NULL) code + queue->Inactivate(); + if (queue->errors_callback_ != NULL) + queue->errors_callback_(HSA_STATUS_ERROR_INVALID_CODE_OBJECT, + queue->public_handle(), queue->errors_data_); + return false; + + } else if ((error_code & 32) == 32) { // Invalid format + queue->Inactivate(); + if (queue->errors_callback_ != NULL) + queue->errors_callback_(HSA_STATUS_ERROR_INVALID_PACKET_FORMAT, + queue->public_handle(), queue->errors_data_); + return false; + } else if ((error_code & 64) == 64) { // Group is too large + queue->Inactivate(); + if (queue->errors_callback_ != NULL) + queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ARGUMENT, + queue->public_handle(), queue->errors_data_); + return false; + } else if ((error_code & 128) == 128) { // Out of VGPRs + queue->Inactivate(); + if (queue->errors_callback_ != NULL) + 
queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ISA, + queue->public_handle(), queue->errors_data_); + return false; + } else if ((error_code & 0x80000000) == 0x80000000) { // Debug trap + queue->Inactivate(); + if (queue->errors_callback_ != NULL) + queue->errors_callback_(HSA_STATUS_ERROR_EXCEPTION, + queue->public_handle(), queue->errors_data_); + return false; + } else { + // Undefined code + queue->Inactivate(); + assert(false && "Undefined queue error code"); + if (queue->errors_callback_ != NULL) + queue->errors_callback_(HSA_STATUS_ERROR, queue->public_handle(), + queue->errors_data_); + return false; + } + + HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0); + return true; +} + +hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count, + const uint32_t* cu_mask) { + HSAKMT_STATUS ret = hsaKmtSetQueueCUMask( + queue_id_, num_cu_mask_count, + reinterpret_cast(const_cast(cu_mask))); + return (HSAKMT_STATUS_SUCCESS == ret) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; +} +} // namespace amd diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp new file mode 100644 index 0000000000..f654ea86e6 --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp @@ -0,0 +1,647 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/amd_blit_kernel.h" + +#include +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#define NOMINMAX +#include +#else +#include +#endif + +#include "core/inc/amd_blit_kernel_kv.h" +#include "core/inc/amd_blit_kernel_vi.h" +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/hsa_internal.h" +#include "core/util/utils.h" + +namespace amd { +const uint32_t BlitKernel::kGroupSize = 256; +const size_t BlitKernel::kMaxCopyCount = AlignDown(UINT32_MAX, kGroupSize); +const size_t BlitKernel::kMaxFillCount = AlignDown(UINT32_MAX, kGroupSize); + +static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID; + +BlitKernel::BlitKernel() + : core::Blit(), + copy_code_handle_(0), + fill_code_handle_(0), + queue_(NULL), + cached_index_(0), + kernarg_(NULL), + kernarg_async_(NULL), + kernarg_async_mask_(0), + kernarg_async_counter_(0), + code_arg_buffer_(NULL) { + completion_signal_.handle = 0; +} + +BlitKernel::~BlitKernel() {} + +hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { + hsa_agent_t agent_handle = agent.public_handle(); + + uint32_t features = 0; + hsa_status_t status = + HSA::hsa_agent_get_info(agent_handle, HSA_AGENT_INFO_FEATURE, &features); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + + if ((features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) == 0) { + return HSA_STATUS_ERROR; + } + + // Need queue buffer that can cover the max size of local memory. 
+ const uint64_t kGpuVmVaSize = 1ULL << 40; + const uint32_t kRequiredQueueSize = NextPow2(static_cast( + std::ceil(static_cast(kGpuVmVaSize) / kMaxCopyCount))); + + uint32_t max_queue_size = 0; + status = HSA::hsa_agent_get_info(agent_handle, HSA_AGENT_INFO_QUEUE_MAX_SIZE, + &max_queue_size); + + if (HSA_STATUS_SUCCESS != status) { + return status; + } + + if (max_queue_size < kRequiredQueueSize) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + status = + HSA::hsa_queue_create(agent_handle, kRequiredQueueSize, + HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue_); + + if (HSA_STATUS_SUCCESS != status) { + return status; + } + + queue_bitmask_ = queue_->size - 1; + + cached_index_ = 0; + + void* copy_raw_obj_mem = NULL; + size_t copy_akc_size = 0; + size_t copy_akc_offset = 0; + + void* copy_aligned_raw_obj_mem = NULL; + size_t copy_aligned_akc_size = 0; + size_t copy_aligned_akc_offset = 0; + + void* fill_raw_obj_mem = NULL; + size_t fill_akc_size = 0; + size_t fill_akc_offset = 0; + + switch (agent.isa()->GetMajorVersion()) { + case 7: + copy_raw_obj_mem = kVectorCopyKvObject; + copy_akc_size = HSA_VECTOR_COPY_KV_AKC_SIZE; + copy_akc_offset = HSA_VECTOR_COPY_KV_AKC_OFFSET; + + copy_aligned_raw_obj_mem = kVectorCopyAlignedKvObject; + copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE; + copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET; + + fill_raw_obj_mem = kFillMemoryKvObject; + fill_akc_size = HSA_FILL_MEMORY_KV_AKC_SIZE; + fill_akc_offset = HSA_FILL_MEMORY_KV_AKC_OFFSET; + break; + case 8: + copy_raw_obj_mem = kVectorCopyViObject; + copy_akc_size = HSA_VECTOR_COPY_VI_AKC_SIZE; + copy_akc_offset = HSA_VECTOR_COPY_VI_AKC_OFFSET; + + copy_aligned_raw_obj_mem = kVectorCopyAlignedViObject; + copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE; + copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET; + + fill_raw_obj_mem = kFillMemoryViObject; + fill_akc_size = HSA_FILL_MEMORY_VI_AKC_SIZE; + fill_akc_offset = 
HSA_FILL_MEMORY_VI_AKC_OFFSET; + break; + default: + assert(false && "Only gfx7 and gfx8 are supported"); + break; + } + + static const size_t kKernArgSize = + std::max(sizeof(KernelCopyArgs), sizeof(KernelFillArgs)); + const size_t total_alloc_size = AlignUp( + AlignUp(copy_akc_size, 256) + AlignUp(copy_aligned_akc_size, 256) + + AlignUp(fill_akc_size, 256) + AlignUp(kKernArgSize, 16), + 4096); + + amd_kernel_code_t *code_ptr = nullptr; + code_arg_buffer_ = core::Runtime::runtime_singleton_->system_allocator()( + total_alloc_size, 4096); + + char* akc_arg = reinterpret_cast(code_arg_buffer_); + memcpy(akc_arg, + reinterpret_cast(copy_raw_obj_mem) + copy_akc_offset, + copy_akc_size); + copy_code_handle_ = reinterpret_cast(akc_arg); + code_ptr = (amd_kernel_code_t*)(copy_code_handle_); + code_ptr->runtime_loader_kernel_symbol = 0; + akc_arg += copy_akc_size; + + akc_arg = AlignUp(akc_arg, 256); + memcpy(akc_arg, reinterpret_cast(copy_aligned_raw_obj_mem) + + copy_aligned_akc_offset, + copy_aligned_akc_size); + copy_aligned_code_handle_ = reinterpret_cast(akc_arg); + code_ptr = (amd_kernel_code_t*)(copy_aligned_code_handle_); + code_ptr->runtime_loader_kernel_symbol = 0; + akc_arg += copy_aligned_akc_size; + + akc_arg = AlignUp(akc_arg, 256); + memcpy(akc_arg, + reinterpret_cast(fill_raw_obj_mem) + fill_akc_offset, + fill_akc_size); + fill_code_handle_ = reinterpret_cast(akc_arg); + code_ptr = (amd_kernel_code_t*)(fill_code_handle_); + code_ptr->runtime_loader_kernel_symbol = 0; + akc_arg += fill_akc_size; + + akc_arg = AlignUp(akc_arg, 16); + kernarg_ = akc_arg; + + status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_); + if (HSA_STATUS_SUCCESS != status) { + return status; + } + + kernarg_async_ = reinterpret_cast( + core::Runtime::runtime_singleton_->system_allocator()( + kRequiredQueueSize * AlignUp(sizeof(KernelCopyArgs), 16), 16)); + + kernarg_async_mask_ = kRequiredQueueSize - 1; + + // TODO(bwicakso): remove this code when execute permission level 
is not + // mandatory. + if (((amd::GpuAgent&)agent).profile() == HSA_PROFILE_FULL) { +#if defined(_WIN32) || defined(_WIN64) +#define NOMINMAX + DWORD old_protect = 0; + const DWORD new_protect = PAGE_EXECUTE_READWRITE; + if (!VirtualProtect(code_arg_buffer_, total_alloc_size, new_protect, + &old_protect)) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } +#else + if (0 != mprotect(code_arg_buffer_, total_alloc_size, + PROT_READ | PROT_WRITE | PROT_EXEC)) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } +#endif + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t BlitKernel::Destroy(void) { + std::lock_guard guard(lock_); + + if (queue_ != NULL) { + HSA::hsa_queue_destroy(queue_); + } + + if (kernarg_async_ != NULL) { + core::Runtime::runtime_singleton_->system_deallocator()(kernarg_async_); + } + + if (code_arg_buffer_ != NULL) { + core::Runtime::runtime_singleton_->system_deallocator()(code_arg_buffer_); + } + + if (completion_signal_.handle != 0) { + HSA::hsa_signal_destroy(completion_signal_); + } + + return HSA_STATUS_SUCCESS; +} + +static bool IsSystemMemory(void* address) { + static const uint64_t kLimitSystem = 1ULL << 48; + return (reinterpret_cast(address) < kLimitSystem); +} + +hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, + size_t size) { + assert(copy_code_handle_ != 0); + + std::lock_guard guard(lock_); + + HSA::hsa_signal_store_relaxed(completion_signal_, 1); + + const size_t kAlignmentChar = 1; + const size_t kAlignmentUin32 = 4; + const size_t kAlignmentVec4 = 16; + const size_t copy_granule = + (IsMultipleOf(dst, kAlignmentVec4) && IsMultipleOf(src, kAlignmentVec4) && + IsMultipleOf(size, kAlignmentVec4)) + ? kAlignmentVec4 + : (IsMultipleOf(dst, kAlignmentUin32) && + IsMultipleOf(src, kAlignmentUin32) && + IsMultipleOf(size, kAlignmentUin32)) + ? 
kAlignmentUin32 + : kAlignmentChar; + + size = size / copy_granule; + + const uint32_t num_copy_packet = static_cast( + std::ceil(static_cast(size) / kMaxCopyCount)); + + // Reserve write index for copy + fence packet. + uint64_t write_index = AcquireWriteIndex(num_copy_packet); + + const uint32_t last_copy_index = num_copy_packet - 1; + size_t total_copy_count = 0; + for (uint32_t i = 0; i < num_copy_packet; ++i) { + // Setup arguments. + const uint32_t copy_count = static_cast( + std::min((size - total_copy_count), kMaxCopyCount)); + + void* cur_dst = static_cast(dst) + (total_copy_count * copy_granule); + const void* cur_src = + static_cast(src) + (total_copy_count * copy_granule); + + KernelCopyArgs* args = ObtainAsyncKernelCopyArg(); + assert(args != NULL); + assert(IsMultipleOf(args, 16)); + + args->src = cur_src; + args->dst = cur_dst; + args->size = copy_count; + args->use_vector = (copy_granule == kAlignmentVec4) ? 1 : 0; + + const uint32_t grid_size_x = + AlignUp(static_cast(copy_count), kGroupSize); + + // This assert to make sure kMaxCopySize is not changed to a number that + // could cause overflow to packet.grid_size_x. + assert(grid_size_x >= copy_count); + + hsa_signal_t signal = {(i == last_copy_index) ? completion_signal_.handle + : 0}; + PopulateQueue(write_index + i, ((copy_granule == kAlignmentChar) + ? copy_code_handle_ + : copy_aligned_code_handle_), + args, grid_size_x, signal); + + total_copy_count += copy_count; + } + + // Launch copy packet. + ReleaseWriteIndex(write_index, num_copy_packet); + + // Wait for the packet to finish. + if (HSA::hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_LT, + 1, uint64_t(-1), + HSA_WAIT_STATE_ACTIVE) != 0) { + // Signal wait returned unexpected value. 
+ return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t BlitKernel::SubmitLinearCopyCommand( + void* dst, const void* src, size_t size, + std::vector& dep_signals, core::Signal& out_signal) { + // The copy kernel handle must be valid before any packet is built. + assert(copy_code_handle_ != 0); + const size_t kAlignmentChar = 1; + const size_t kAlignmentUin32 = 4; + const size_t kAlignmentVec4 = 16; + const size_t copy_granule = + (IsMultipleOf(dst, kAlignmentVec4) && IsMultipleOf(src, kAlignmentVec4) && + IsMultipleOf(size, kAlignmentVec4)) + ? kAlignmentVec4 + : (IsMultipleOf(dst, kAlignmentUin32) && + IsMultipleOf(src, kAlignmentUin32) && + IsMultipleOf(size, kAlignmentUin32)) + ? kAlignmentUin32 + : kAlignmentChar; + + size = size / copy_granule; + + const uint32_t num_copy_packet = static_cast( + std::ceil(static_cast(size) / kMaxCopyCount)); + + const uint32_t num_barrier_packet = + static_cast(std::ceil(dep_signals.size() / 5.0f)); + + // Reserve write index for copy + fence packet. + const uint32_t total_num_packet = num_barrier_packet + num_copy_packet; + + uint64_t write_index = AcquireWriteIndex(total_num_packet); + uint64_t write_index_temp = write_index; + + const uint16_t kBarrierPacketHeader = + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + + hsa_barrier_and_packet_t barrier_packet = {0}; + barrier_packet.header = HSA_PACKET_TYPE_INVALID; + + hsa_barrier_and_packet_t* queue_buffer = + reinterpret_cast(queue_->base_address); + + const size_t dep_signal_count = dep_signals.size(); + for (size_t i = 0; i < dep_signal_count; ++i) { + const size_t idx = i % 5; + barrier_packet.dep_signal[idx] = core::Signal::Convert(dep_signals[i]); + if (i == (dep_signal_count - 1) || idx == 4) { + std::atomic_thread_fence(std::memory_order_acquire); + queue_buffer[(write_index)&queue_bitmask_] = barrier_packet; +
std::atomic_thread_fence(std::memory_order_release); + queue_buffer[(write_index)&queue_bitmask_].header = kBarrierPacketHeader; + + ++write_index; + + memset(&barrier_packet, 0, sizeof(hsa_barrier_and_packet_t)); + barrier_packet.header = HSA_PACKET_TYPE_INVALID; + } + } + + const uint32_t last_copy_index = num_copy_packet - 1; + size_t total_copy_count = 0; + for (uint32_t i = 0; i < num_copy_packet; ++i) { + // Setup arguments. + const uint32_t copy_count = static_cast( + std::min((size - total_copy_count), kMaxCopyCount)); + + void* cur_dst = static_cast(dst) + (total_copy_count * copy_granule); + const void* cur_src = + static_cast(src) + (total_copy_count * copy_granule); + + KernelCopyArgs* args = ObtainAsyncKernelCopyArg(); + assert(args != NULL); + assert(IsMultipleOf(args, 16)); + + args->src = cur_src; + args->dst = cur_dst; + args->size = copy_count; + args->use_vector = (copy_granule == kAlignmentVec4) ? 1 : 0; + + const uint32_t grid_size_x = + AlignUp(static_cast(copy_count), kGroupSize); + + // This assert to make sure kMaxCopySize is not changed to a number that + // could cause overflow to packet.grid_size_x. + assert(grid_size_x >= copy_count); + + hsa_signal_t signal = {(i == last_copy_index) + ? (core::Signal::Convert(&out_signal)).handle + : 0}; + PopulateQueue(write_index, ((copy_granule == kAlignmentChar) + ? copy_code_handle_ + : copy_aligned_code_handle_), + args, grid_size_x, signal); + + ++write_index; + + total_copy_count += copy_count; + } + + // Launch copy packet. + ReleaseWriteIndex(write_index_temp, total_num_packet); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value, + size_t num) { + assert(fill_code_handle_ != 0); + + std::lock_guard guard(lock_); + + HSA::hsa_signal_store_relaxed(completion_signal_, 1); + + const uint32_t num_fill_packet = static_cast( + std::ceil(static_cast(num) / kMaxFillCount)); + + // Reserve write index for copy + fence packet. 
+ uint64_t write_index = AcquireWriteIndex(num_fill_packet); + + KernelFillArgs* args = reinterpret_cast(kernarg_); + + if (args == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + const uint32_t last_fill_index = num_fill_packet - 1; + size_t total_fill_count = 0; + for (uint32_t i = 0; i < num_fill_packet; ++i) { + assert(IsMultipleOf(&args[i], 16)); + + // Setup arguments. + const uint32_t fill_count = static_cast( + std::min((num - total_fill_count), kMaxFillCount)); + void* cur_ptr = static_cast(ptr) + total_fill_count; + + args[i].ptr = cur_ptr; + args[i].num = fill_count; + args[i].value = value; + + const uint32_t grid_size_x = + AlignUp(static_cast(fill_count), kGroupSize); + + // This assert to make sure kMaxFillCount is not changed to a number that + // could cause overflow to packet.grid_size_x. + assert(grid_size_x >= fill_count); + + hsa_signal_t signal = {(i == last_fill_index) ? completion_signal_.handle + : 0}; + PopulateQueue(write_index + i, fill_code_handle_, &args[i], grid_size_x, + signal); + + total_fill_count += fill_count; + } + + // Launch fill packet. + // Launch copy packet. + ReleaseWriteIndex(write_index, num_fill_packet); + + // Wait for the packet to finish. + if (HSA::hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_LT, + 1, uint64_t(-1), + HSA_WAIT_STATE_ACTIVE) != 0) { + // Signal wait returned unexpected value. + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +uint64_t BlitKernel::AcquireWriteIndex(uint32_t num_packet) { + assert(queue_->size >= num_packet); + + uint64_t write_index = + HSA::hsa_queue_add_write_index_acq_rel(queue_, num_packet); + + while (true) { + // Wait until we have room in the queue; + const uint64_t read_index = HSA::hsa_queue_load_read_index_relaxed(queue_); + if ((write_index - read_index) < queue_->size) { + break; + } + } + + return write_index; +} + +void BlitKernel::ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet) { + // Launch packet. 
+ while (true) { + // Make sure that the address before ::current_offset is already released. + // Otherwise the packet processor may read invalid packets. + uint64_t expected_offset = write_index; + if (atomic::Cas(&cached_index_, write_index + num_packet, expected_offset, + std::memory_order_release) == expected_offset) { + // Update doorbel register with last packet id. + HSA::hsa_signal_store_release(queue_->doorbell_signal, + write_index + num_packet - 1); + break; + } + } +} + +hsa_status_t BlitKernel::FenceRelease(uint64_t write_index, + uint32_t num_copy_packet, + hsa_fence_scope_t fence) { + // This function is not thread safe. + + const uint16_t kBarrierPacketHeader = + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (fence << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + + hsa_barrier_and_packet_t packet = {0}; + packet.header = kInvalidPacketHeader; + + HSA::hsa_signal_store_relaxed(completion_signal_, 1); + packet.completion_signal = completion_signal_; + + if (num_copy_packet == 0) { + assert(write_index == 0); + // Reserve write index. + write_index = AcquireWriteIndex(1); + } + + // Populate queue buffer with AQL packet. + hsa_barrier_and_packet_t* queue_buffer = + reinterpret_cast(queue_->base_address); + std::atomic_thread_fence(std::memory_order_acquire); + queue_buffer[(write_index + num_copy_packet) & queue_bitmask_] = packet; + std::atomic_thread_fence(std::memory_order_release); + queue_buffer[(write_index + num_copy_packet) & queue_bitmask_].header = + kBarrierPacketHeader; + + // Launch packet. + ReleaseWriteIndex(write_index, num_copy_packet + 1); + + // Wait for the packet to finish. + if (HSA::hsa_signal_wait_acquire(packet.completion_signal, + HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), + HSA_WAIT_STATE_ACTIVE) != 0) { + // Signal wait returned unexpected value. 
+ return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args, + uint32_t grid_size_x, + hsa_signal_t completion_signal) { + assert(IsMultipleOf(args, 16)); + + hsa_kernel_dispatch_packet_t packet = {0}; + + // Built per call: the barrier bit depends on this call's completion_signal, + // so the header must not be cached in a function-local static (which would + // be initialized once, from the first caller's signal, and then reused). + const uint16_t kDispatchPacketHeader = + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (((completion_signal.handle != 0) ? 1 : 0) << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + + packet.header = kInvalidPacketHeader; + packet.kernel_object = code_handle; + packet.kernarg_address = args; + + // Setup working size. + const int kNumDimension = 1; + packet.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet.grid_size_x = AlignUp(static_cast(grid_size_x), kGroupSize); + packet.grid_size_y = packet.grid_size_z = 1; + packet.workgroup_size_x = kGroupSize; + packet.workgroup_size_y = packet.workgroup_size_z = 1; + + packet.completion_signal = completion_signal; + + // Populate queue buffer with AQL packet.
+ hsa_kernel_dispatch_packet_t* queue_buffer = + reinterpret_cast(queue_->base_address); + std::atomic_thread_fence(std::memory_order_acquire); + queue_buffer[index & queue_bitmask_] = packet; + std::atomic_thread_fence(std::memory_order_release); + queue_buffer[index & queue_bitmask_].header = kDispatchPacketHeader; +} + +BlitKernel::KernelCopyArgs* BlitKernel::ObtainAsyncKernelCopyArg() { + const uint32_t index = + atomic::Add(&kernarg_async_counter_, 1U, std::memory_order_acquire); + KernelCopyArgs* arg = &kernarg_async_[index & kernarg_async_mask_]; + assert(IsMultipleOf(arg, 16)); + return arg; +} + +} // namespace amd diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp new file mode 100644 index 0000000000..b22da34cd1 --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -0,0 +1,858 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/amd_blit_sdma.h" + +#include +#include +#include +#include + +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/runtime.h" +#include "core/inc/signal.h" + +namespace amd { +// SDMA packet for VI device. 
+// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt + +const unsigned int SDMA_OP_COPY = 1; +const unsigned int SDMA_OP_FENCE = 5; +const unsigned int SDMA_OP_POLL_REGMEM = 8; +const unsigned int SDMA_OP_ATOMIC = 10; +const unsigned int SDMA_OP_CONST_FILL = 11; +const unsigned int SDMA_SUBOP_COPY_LINEAR = 0; +const unsigned int SDMA_ATOMIC_ADD64 = 47; + +typedef struct SDMA_PKT_COPY_LINEAR_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int extra_info : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int count : 22; + unsigned int reserved_0 : 10; + }; + unsigned int DW_1_DATA; + } COUNT_UNION; + + union { + struct { + unsigned int reserved_0 : 16; + unsigned int dst_swap : 2; + unsigned int reserved_1 : 6; + unsigned int src_swap : 2; + unsigned int reserved_2 : 6; + }; + unsigned int DW_2_DATA; + } PARAMETER_UNION; + + union { + struct { + unsigned int src_addr_31_0 : 32; + }; + unsigned int DW_3_DATA; + } SRC_ADDR_LO_UNION; + + union { + struct { + unsigned int src_addr_63_32 : 32; + }; + unsigned int DW_4_DATA; + } SRC_ADDR_HI_UNION; + + union { + struct { + unsigned int dst_addr_31_0 : 32; + }; + unsigned int DW_5_DATA; + } DST_ADDR_LO_UNION; + + union { + struct { + unsigned int dst_addr_63_32 : 32; + }; + unsigned int DW_6_DATA; + } DST_ADDR_HI_UNION; +} SDMA_PKT_COPY_LINEAR; + +typedef struct SDMA_PKT_CONSTANT_FILL_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int sw : 2; + unsigned int reserved_0 : 12; + unsigned int fillsize : 2; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int dst_addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } DST_ADDR_LO_UNION; + + union { + struct { + unsigned int dst_addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } DST_ADDR_HI_UNION; + + union { + struct { + unsigned int src_data_31_0 : 32; + }; + unsigned int DW_3_DATA; + } DATA_UNION; + + union { + 
struct { + unsigned int count : 22; + unsigned int reserved_0 : 10; + }; + unsigned int DW_4_DATA; + } COUNT_UNION; +} SDMA_PKT_CONSTANT_FILL; + +typedef struct SDMA_PKT_FENCE_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int data : 32; + }; + unsigned int DW_3_DATA; + } DATA_UNION; +} SDMA_PKT_FENCE; + +typedef struct SDMA_PKT_POLL_REGMEM_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 10; + unsigned int hdp_flush : 1; + unsigned int reserved_1 : 1; + unsigned int func : 3; + unsigned int mem_poll : 1; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int value : 32; + }; + unsigned int DW_3_DATA; + } VALUE_UNION; + + union { + struct { + unsigned int mask : 32; + }; + unsigned int DW_4_DATA; + } MASK_UNION; + + union { + struct { + unsigned int interval : 16; + unsigned int retry_count : 12; + unsigned int reserved_0 : 4; + }; + unsigned int DW_5_DATA; + } DW5_UNION; +} SDMA_PKT_POLL_REGMEM; + +typedef struct SDMA_PKT_ATOMIC_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int l : 1; + unsigned int reserved_0 : 8; + unsigned int operation : 7; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } 
ADDR_HI_UNION; + + union { + struct { + unsigned int src_data_31_0 : 32; + }; + unsigned int DW_3_DATA; + } SRC_DATA_LO_UNION; + + union { + struct { + unsigned int src_data_63_32 : 32; + }; + unsigned int DW_4_DATA; + } SRC_DATA_HI_UNION; + + union { + struct { + unsigned int cmp_data_31_0 : 32; + }; + unsigned int DW_5_DATA; + } CMP_DATA_LO_UNION; + + union { + struct { + unsigned int cmp_data_63_32 : 32; + }; + unsigned int DW_6_DATA; + } CMP_DATA_HI_UNION; + + union { + struct { + unsigned int loop_interval : 13; + unsigned int reserved_0 : 19; + }; + unsigned int DW_7_DATA; + } LOOP_UNION; +} SDMA_PKT_ATOMIC; + +inline uint32_t ptrlow32(const void* p) { + return static_cast(reinterpret_cast(p)); +} + +inline uint32_t ptrhigh32(const void* p) { +#if defined(HSA_LARGE_MODEL) + return static_cast(reinterpret_cast(p) >> 32); +#else + return 0; +#endif +} + +BlitSdma::BlitSdma() + : core::Blit(), + queue_size_(0), + queue_start_addr_(NULL), + fence_base_addr_(NULL), + fence_pool_size_(0), + fence_pool_counter_(0), + cached_reserve_offset_(0), + cached_commit_offset_(0) { + std::memset(&queue_resource_, 0, sizeof(queue_resource_)); +} + +BlitSdma::~BlitSdma() {} + +hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { + if (queue_start_addr_ != NULL && queue_size_ != 0) { + // Already initialized. 
+ return HSA_STATUS_SUCCESS; + } + + if (agent.device_type() != core::Agent::kAmdGpuDevice) { + return HSA_STATUS_ERROR; + } + + linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR); + fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL); + fence_command_size_ = sizeof(SDMA_PKT_FENCE); + poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM); + atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC); + + // NOTE(review): queue_size_ is only assigned further down in this function, + // so on a freshly constructed (or destroyed) object it is still 0 here and + // the command-count limits below are not derived from the real queue size. + // TODO confirm and move the queue_size_ assignment above this computation. + const uint32_t sync_command_size = fence_command_size_; + const uint32_t max_num_copy_command = + std::floor((static_cast(queue_size_) - sync_command_size) / + linear_copy_command_size_); + const uint32_t max_num_fill_command = + std::floor((static_cast(queue_size_) - sync_command_size) / + fill_command_size_); + + max_single_linear_copy_size_ = 0x3fffe0; + max_total_linear_copy_size_ = static_cast( + std::min(static_cast(SIZE_MAX), + static_cast(max_num_copy_command) * + static_cast(max_single_linear_copy_size_))); + + max_single_fill_size_ = 1 << 22; + max_total_fill_size_ = static_cast( + std::min(static_cast(SIZE_MAX), + static_cast(max_num_fill_command) * + static_cast(max_single_fill_size_))); + + const amd::GpuAgent& amd_gpu_agent = static_cast(agent); + + if (amd_gpu_agent.isa()->version() != core::Isa::Version(8, 0, 3)) { + assert(false && "Only for Fiji currently"); + return HSA_STATUS_ERROR; + } + + // Allocate queue buffer.
+ const size_t kPageSize = 4096; + const size_t kSdmaQueueSize = 1024 * 1024; + + queue_size_ = kSdmaQueueSize; + + HsaMemFlags flags; + flags.Value = 0; + flags.ui32.HostAccess = 1; + flags.ui32.AtomicAccessPartial = 1; + flags.ui32.ExecuteAccess = 1; + + auto err = hsaKmtAllocMemory(amd_gpu_agent.node_id(), queue_size_, flags, + reinterpret_cast(&queue_start_addr_)); + + if (err != HSAKMT_STATUS_SUCCESS) { + assert(false && "SDMA queue memory allocation failure."); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + HSAuint64 alternate_va; + err = hsaKmtMapMemoryToGPU(queue_start_addr_, queue_size_, &alternate_va); + + if (err != HSAKMT_STATUS_SUCCESS) { + assert(false && "AQL queue memory map failure."); + Destroy(); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + std::memset(queue_start_addr_, 0, queue_size_); + + // Access kernel driver to initialize the queue control block + // This call binds user mode queue object to underlying compute + // device. + const GpuAgent& gpu_agent = reinterpret_cast(agent); + const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA; + if (HSAKMT_STATUS_SUCCESS != + hsaKmtCreateQueue(gpu_agent.node_id(), kQueueType_, 100, + HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_, + queue_size_, NULL, &queue_resource_)) { + Destroy(); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + cached_reserve_offset_ = *(queue_resource_.Queue_write_ptr); + cached_commit_offset_ = cached_reserve_offset_; + + fence_pool_size_ = + static_cast(std::ceil(kSdmaQueueSize / fence_command_size_)); + + fence_pool_mask_ = fence_pool_size_ - 1; + + fence_base_addr_ = reinterpret_cast( + core::Runtime::runtime_singleton_->system_allocator()( + fence_pool_size_ * sizeof(uint32_t), 256)); + + if (fence_base_addr_ == NULL) { + Destroy(); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t BlitSdma::Destroy(void) { + // Release all allocated resources and reset them to zero. 
+ + if (queue_resource_.QueueId != 0) { + // Release queue resources from the kernel + auto err = hsaKmtDestroyQueue(queue_resource_.QueueId); + assert(err == HSAKMT_STATUS_SUCCESS); + memset(&queue_resource_, 0, sizeof(queue_resource_)); + } + + if (queue_start_addr_ != NULL && queue_size_ != 0) { + // Release queue buffer. + hsaKmtUnmapMemoryToGPU(queue_start_addr_); + hsaKmtFreeMemory(queue_start_addr_, queue_size_); + } + + if (fence_base_addr_ != NULL) { + core::Runtime::runtime_singleton_->system_deallocator()(fence_base_addr_); + // Clear the pointer so a repeated Destroy() does not free the fence pool + // twice. + fence_base_addr_ = NULL; + } + + queue_size_ = 0; + queue_start_addr_ = NULL; + cached_reserve_offset_ = 0; + cached_commit_offset_ = 0; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src, + size_t size) { + if (size > max_total_linear_copy_size_) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + // Break the copy into multiple copy operations in case the copy size + // exceeds the SDMA linear copy limit. + const uint32_t num_copy_command = static_cast( + std::ceil(static_cast(size) / max_single_linear_copy_size_)); + + const uint32_t total_copy_command_size = + num_copy_command * linear_copy_command_size_; + + const uint32_t total_command_size = + total_copy_command_size + fence_command_size_; + + const uint32_t kFenceValue = 2015; + uint32_t* fence_addr = ObtainFenceObject(); + *fence_addr = 0; + + char* command_addr = AcquireWriteAddress(total_command_size); + char* const command_addr_temp = command_addr; + + if (command_addr == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + BuildCopyCommand(command_addr, num_copy_command, dst, src, size); + + command_addr += total_copy_command_size; + + BuildFenceCommand(command_addr, fence_addr, kFenceValue); + + ReleaseWriteAddress(command_addr_temp, total_command_size); + + WaitFence(fence_addr, kFenceValue); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t BlitSdma::SubmitLinearCopyCommand( + void* dst, const void* src, size_t size, +
std::vector& dep_signals, core::Signal& out_signal) { + if (size > max_total_linear_copy_size_) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + // The signal is a 64 bit value, and poll checks a 32 bit value. So we + // need to use two poll operations per dependent signal. + const uint32_t num_poll_command = + static_cast(2 * dep_signals.size()); + const uint32_t total_poll_command_size = + (num_poll_command * poll_command_size_); + + // Break the copy into multiple copy operations in case the copy size + // exceeds the SDMA linear copy limit. + const uint32_t num_copy_command = static_cast( + std::ceil(static_cast(size) / max_single_linear_copy_size_)); + const uint32_t total_copy_command_size = + num_copy_command * linear_copy_command_size_; + + const uint32_t total_command_size = + total_poll_command_size + total_copy_command_size + atomic_command_size_ + + fence_command_size_; + + const uint32_t kFenceValue = 2015; + uint32_t* fence_addr = ObtainFenceObject(); + *fence_addr = 0; + + char* command_addr = AcquireWriteAddress(total_command_size); + char* const command_addr_temp = command_addr; + + if (command_addr == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + for (size_t i = 0; i < dep_signals.size(); ++i) { + uint32_t* signal_addr = + reinterpret_cast(dep_signals[i]->ValueLocation()); + // Wait for the second 32-bit word of the signal value (the upper half on + // little-endian hosts) to become 0. + BuildPollCommand(command_addr, &signal_addr[1], 0); + command_addr += poll_command_size_; + // Then wait for the first 32-bit word (the lower half) to become 0. + BuildPollCommand(command_addr, &signal_addr[0], 0); + command_addr += poll_command_size_; + } + + // Do the transfer after all polls are satisfied. + BuildCopyCommand(command_addr, num_copy_command, dst, src, size); + + command_addr += total_copy_command_size; + + // After transfer is completed, decrement the signal.
+ BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation()); + + command_addr += atomic_command_size_; + + BuildFenceCommand(command_addr, fence_addr, kFenceValue); + + ReleaseWriteAddress(command_addr_temp, total_command_size); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, + size_t count) { + const size_t size = count * sizeof(uint32_t); + + if (size > max_total_fill_size_) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + // Break the copy into multiple copy operation incase the copy size exceeds + // the SDMA linear copy limit. + const uint32_t num_fill_command = static_cast( + std::ceil(static_cast(size) / max_single_fill_size_)); + + const uint32_t total_fill_command_size = + num_fill_command * fill_command_size_; + + const uint32_t total_command_size = + total_fill_command_size + fence_command_size_; + + char* command_addr = AcquireWriteAddress(total_command_size); + char* const command_addr_temp = command_addr; + + if (command_addr == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + const uint32_t fill_command_size = fill_command_size_; + size_t cur_size = 0; + for (uint32_t i = 0; i < num_fill_command; ++i) { + const uint32_t fill_size = static_cast( + std::min((size - cur_size), max_single_fill_size_)); + + void* cur_ptr = static_cast(ptr) + cur_size; + + SDMA_PKT_CONSTANT_FILL* packet_addr = + reinterpret_cast(command_addr); + + memset(packet_addr, 0, sizeof(SDMA_PKT_CONSTANT_FILL)); + + packet_addr->HEADER_UNION.op = SDMA_OP_CONST_FILL; + packet_addr->HEADER_UNION.fillsize = 2; // DW fill + + packet_addr->DST_ADDR_LO_UNION.dst_addr_31_0 = ptrlow32(cur_ptr); + packet_addr->DST_ADDR_HI_UNION.dst_addr_63_32 = ptrhigh32(cur_ptr); + + packet_addr->DATA_UNION.src_data_31_0 = value; + + packet_addr->COUNT_UNION.count = fill_size; + + command_addr += fill_command_size; + cur_size += fill_size; + } + + assert(cur_size == size); + + const uint32_t kFenceValue = 2015; + 
uint32_t* fence_addr = ObtainFenceObject(); + *fence_addr = 0; + + BuildFenceCommand(command_addr, fence_addr, kFenceValue); + + ReleaseWriteAddress(command_addr_temp, total_command_size); + + WaitFence(fence_addr, kFenceValue); + + return HSA_STATUS_SUCCESS; +} + +char* BlitSdma::AcquireWriteAddress(uint32_t cmd_size) { + if (cmd_size > queue_size_) { + return NULL; + } + + while (true) { + uint32_t curr_offset = + atomic::Load(&cached_reserve_offset_, std::memory_order_acquire); + const uint32_t end_offset = curr_offset + cmd_size; + + if (end_offset >= queue_size_) { + // Queue buffer is not enough to contain the new command. + + // The safe space for the new command is the start of the queue buffer to + // the last read address. + if (atomic::Load(queue_resource_.Queue_read_ptr, + std::memory_order_acquire) < cmd_size) { + // There is no safe space to use currently. + return NULL; + } + + WrapQueue(cmd_size); + + continue; + } + + if (atomic::Cas(&cached_reserve_offset_, end_offset, curr_offset, + std::memory_order_release) == curr_offset) { + return queue_start_addr_ + curr_offset; + } + } + + return NULL; +} + +void BlitSdma::UpdateWriteAndDoorbellRegister(uint32_t current_offset, + uint32_t new_offset) { + while (true) { + // Make sure that the address before ::current_offset is already released. + // Otherwise the CP may read invalid packets. + if (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) == + current_offset) { + // Update write pointer and doorbel register. 
+ atomic::Store(queue_resource_.Queue_write_ptr, new_offset); + atomic::Store(queue_resource_.Queue_DoorBell, new_offset); + atomic::Store(&cached_commit_offset_, new_offset, + std::memory_order_release); + break; + } + } +} + +void BlitSdma::ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size) { + assert(cmd_addr != NULL); + assert(cmd_addr >= queue_start_addr_); + + if (cmd_size > queue_size_) { + assert(false && "cmd_addr is outside the queue buffer range"); + return; + } + + // Update write register. + const uint32_t curent_offset = cmd_addr - queue_start_addr_; + const uint32_t new_offset = curent_offset + cmd_size; + UpdateWriteAndDoorbellRegister(curent_offset, new_offset); +} + +void BlitSdma::WrapQueue(uint32_t cmd_size) { + // Re-determine the offset into queue buffer where NOOP instructions + // should be written. + while (true) { + uint32_t curent_offset = + atomic::Load(&cached_reserve_offset_, std::memory_order_acquire); + const uint32_t end_offset = curent_offset + cmd_size; + if (end_offset < queue_size_) { + return; + } + + std::lock_guard guard(wrap_lock_); + + if (atomic::Cas(&cached_reserve_offset_, queue_size_ + 1, curent_offset, + std::memory_order_release) == curent_offset) { + // Wait till all reserved packets are commited. + while (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) != + curent_offset) { + os::YieldThread(); + } + + // Fill the remainder of the queue with NOOP commands. + char* noop_address = queue_start_addr_ + curent_offset; + const size_t noop_commands_size = queue_size_ - curent_offset; + memset(noop_address, 0, noop_commands_size); + + // Update write and doorbell registers to execute NOOP instructions. + UpdateWriteAndDoorbellRegister(curent_offset, 0); + + // Wait till queue wrapped. + while (atomic::Load(queue_resource_.Queue_read_ptr, + std::memory_order_acquire) != 0) { + os::YieldThread(); + } + + // Open access to queue. 
+ atomic::Store(&cached_reserve_offset_, 0U, std::memory_order_release); + } + } +} + +void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence, + uint32_t fence_value) { + assert(fence_command_addr != NULL); + SDMA_PKT_FENCE* packet_addr = + reinterpret_cast(fence_command_addr); + + memset(packet_addr, 0, sizeof(SDMA_PKT_FENCE)); + + packet_addr->HEADER_UNION.op = SDMA_OP_FENCE; + + packet_addr->ADDR_LO_UNION.addr_31_0 = ptrlow32(fence); + + packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(fence); + + packet_addr->DATA_UNION.data = fence_value; +} + +uint32_t* BlitSdma::ObtainFenceObject() { + const uint32_t fence_index = + atomic::Add(&fence_pool_counter_, 1U, std::memory_order_acquire); + uint32_t* fence_addr = &fence_base_addr_[fence_index & fence_pool_mask_]; + assert(IsMultipleOf(fence_addr, 4)); + return fence_addr; +} + +void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) { + int spin_count = 51; + while (atomic::Load(fence, std::memory_order_acquire) != fence_value) { + if (--spin_count > 0) { + continue; + } + os::YieldThread(); + } +} + +void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, + void* dst, const void* src, size_t size) { + size_t cur_size = 0; + for (uint32_t i = 0; i < num_copy_command; ++i) { + const uint32_t copy_size = static_cast( + std::min((size - cur_size), max_single_linear_copy_size_)); + + void* cur_dst = static_cast(dst) + cur_size; + const void* cur_src = static_cast(src) + cur_size; + + SDMA_PKT_COPY_LINEAR* packet_addr = + reinterpret_cast(cmd_addr); + + memset(packet_addr, 0, sizeof(SDMA_PKT_COPY_LINEAR)); + + packet_addr->HEADER_UNION.op = SDMA_OP_COPY; + packet_addr->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR; + + packet_addr->COUNT_UNION.count = copy_size; + + packet_addr->SRC_ADDR_LO_UNION.src_addr_31_0 = ptrlow32(cur_src); + packet_addr->SRC_ADDR_HI_UNION.src_addr_63_32 = ptrhigh32(cur_src); + + packet_addr->DST_ADDR_LO_UNION.dst_addr_31_0 = ptrlow32(cur_dst); + 
packet_addr->DST_ADDR_HI_UNION.dst_addr_63_32 = ptrhigh32(cur_dst); + + cmd_addr += linear_copy_command_size_; + cur_size += copy_size; + } + + assert(cur_size == size); +} + +void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr, + uint32_t reference) { + SDMA_PKT_POLL_REGMEM* packet_addr = + reinterpret_cast(cmd_addr); + + memset(packet_addr, 0, sizeof(SDMA_PKT_POLL_REGMEM)); + + packet_addr->HEADER_UNION.op = SDMA_OP_POLL_REGMEM; + packet_addr->HEADER_UNION.mem_poll = 1; + packet_addr->HEADER_UNION.func = 0x3; // IsEqual. + packet_addr->ADDR_LO_UNION.addr_31_0 = ptrlow32(addr); + packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(addr); + + packet_addr->VALUE_UNION.value = reference; + + packet_addr->MASK_UNION.mask = 0xffffffff; // Compare the whole content. + + packet_addr->DW5_UNION.interval = 0x04; + packet_addr->DW5_UNION.retry_count = 0xfff; // Retry forever. +} + +void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) { + SDMA_PKT_ATOMIC* packet_addr = reinterpret_cast(cmd_addr); + + memset(packet_addr, 0, sizeof(SDMA_PKT_ATOMIC)); + + packet_addr->HEADER_UNION.op = SDMA_OP_ATOMIC; + packet_addr->HEADER_UNION.operation = SDMA_ATOMIC_ADD64; + + packet_addr->ADDR_LO_UNION.addr_31_0 = ptrlow32(addr); + packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(addr); + + packet_addr->SRC_DATA_LO_UNION.src_data_31_0 = 0xffffffff; + packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff; +} +} // namespace amd diff --git a/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp new file mode 100644 index 0000000000..fefa17e60e --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_cpu_agent.cpp @@ -0,0 +1,329 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. 
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/amd_cpu_agent.h" + +#include +#include + +#include "core/inc/amd_memory_region.h" +#include "core/inc/host_queue.h" + +#include "hsa_ext_image.h" + +namespace amd { +CpuAgent::CpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) + : core::Agent(node, kAmdCpuDevice), properties_(node_props) { + InitRegionList(); + + InitCacheList(); +} + +CpuAgent::~CpuAgent() { + std::for_each(regions_.begin(), regions_.end(), DeleteObject()); + regions_.clear(); +} + +void CpuAgent::InitRegionList() { + const bool is_apu_node = (properties_.NumFComputeCores > 0); + + std::vector mem_props(properties_.NumMemoryBanks); + if (HSAKMT_STATUS_SUCCESS == + hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, + &mem_props[0])) { + std::vector::iterator system_prop = + std::find_if(mem_props.begin(), mem_props.end(), + [](HsaMemoryProperties prop) -> bool { + return (prop.SizeInBytes > 0 && prop.HeapType == HSA_HEAPTYPE_SYSTEM); + }); + + if (system_prop != mem_props.end()) { + MemoryRegion* system_region_fine = + new MemoryRegion(true, is_apu_node, this, *system_prop); + + regions_.push_back(system_region_fine); + + if (!is_apu_node) { + MemoryRegion* system_region_coarse = + new MemoryRegion(false, is_apu_node, this, *system_prop); + + regions_.push_back(system_region_coarse); + } + } else { + HsaMemoryProperties system_props; + std::memset(&system_props, 0, sizeof(HsaMemoryProperties)); + + const uintptr_t system_base = os::GetUserModeVirtualMemoryBase(); + const size_t system_physical_size = os::GetUsablePhysicalHostMemorySize(); + assert(system_physical_size != 0); + + system_props.HeapType = HSA_HEAPTYPE_SYSTEM; + system_props.SizeInBytes = (HSAuint64)system_physical_size; + system_props.VirtualBaseAddress = (HSAuint64)(system_base); + + MemoryRegion* system_region = + new MemoryRegion(true, is_apu_node, this, system_props); + + 
regions_.push_back(system_region); + } + } +} + +void CpuAgent::InitCacheList() { + // Get CPU cache information. + cache_props_.resize(properties_.NumCaches); + if (HSAKMT_STATUS_SUCCESS != + hsaKmtGetNodeCacheProperties(node_id(), properties_.CComputeIdLo, + properties_.NumCaches, &cache_props_[0])) { + cache_props_.clear(); + } else { + // Only store CPU D-cache. + for (size_t cache_id = 0; cache_id < cache_props_.size(); ++cache_id) { + const HsaCacheType type = cache_props_[cache_id].CacheType; + if (type.ui32.CPU != 1 || type.ui32.Instruction == 1) { + cache_props_.erase(cache_props_.begin() + cache_id); + --cache_id; + } + } + } +} + +hsa_status_t CpuAgent::VisitRegion(bool include_peer, + hsa_status_t (*callback)(hsa_region_t region, + void* data), + void* data) const { + if (!include_peer) { + return VisitRegion(regions_, callback, data); + } + + // Expose all system regions in the system. + hsa_status_t stat = VisitRegion( + core::Runtime::runtime_singleton_->system_regions_fine(), callback, data); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + + return VisitRegion(core::Runtime::runtime_singleton_->system_regions_coarse(), + callback, data); +} + +hsa_status_t CpuAgent::VisitRegion( + const std::vector& regions, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const { + for (const core::MemoryRegion* region : regions) { + hsa_region_t region_handle = core::MemoryRegion::Convert(region); + hsa_status_t status = callback(region_handle, data); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t CpuAgent::IterateRegion( + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const { + return VisitRegion(true, callback, data); +} + +hsa_status_t CpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { + const size_t kNameSize = 64; // agent, and vendor name size limit + + const size_t attribute_u = static_cast(attribute); + switch 
(attribute_u) { + case HSA_AGENT_INFO_NAME: + // TODO: hardcode for now, wait until SWDEV-88894 implemented + std::memset(value, 0, kNameSize); + std::memcpy(value, "CPU Device", sizeof("CPU Device")); + break; + case HSA_AGENT_INFO_VENDOR_NAME: + // TODO: hardcode for now, wait until SWDEV-88894 implemented + std::memset(value, 0, kNameSize); + std::memcpy(value, "CPU", sizeof("CPU")); + break; + case HSA_AGENT_INFO_FEATURE: + *((hsa_agent_feature_t*)value) = static_cast(0); + break; + case HSA_AGENT_INFO_MACHINE_MODEL: +#if defined(HSA_LARGE_MODEL) + *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_LARGE; +#else + *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_SMALL; +#endif + break; + case HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES: + case HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE: + // TODO: validate if this is true. + *((hsa_default_float_rounding_mode_t*)value) = + HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR; + break; + case HSA_AGENT_INFO_FAST_F16_OPERATION: + // TODO: validate if this is trye. + *((bool*)value) = false; + break; + case HSA_AGENT_INFO_PROFILE: + *((hsa_profile_t*)value) = HSA_PROFILE_FULL; + break; + case HSA_AGENT_INFO_WAVEFRONT_SIZE: + *((uint32_t*)value) = 0; + break; + case HSA_AGENT_INFO_WORKGROUP_MAX_DIM: + std::memset(value, 0, sizeof(uint16_t) * 3); + break; + case HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: + *((uint32_t*)value) = 0; + break; + case HSA_AGENT_INFO_GRID_MAX_DIM: + std::memset(value, 0, sizeof(hsa_dim3_t)); + break; + case HSA_AGENT_INFO_GRID_MAX_SIZE: + *((uint32_t*)value) = 0; + break; + case HSA_AGENT_INFO_FBARRIER_MAX_SIZE: + // TODO: ? 
+ *((uint32_t*)value) = 0; + break; + case HSA_AGENT_INFO_QUEUES_MAX: + *((uint32_t*)value) = 0; + break; + case HSA_AGENT_INFO_QUEUE_MIN_SIZE: + *((uint32_t*)value) = 0; + break; + case HSA_AGENT_INFO_QUEUE_MAX_SIZE: + *((uint32_t*)value) = 0; + break; + case HSA_AGENT_INFO_QUEUE_TYPE: + *((hsa_queue_type_t*)value) = static_cast(0); + break; + case HSA_AGENT_INFO_NODE: + // TODO: associate with OS NUMA support (numactl / GetNumaProcessorNode). + *((uint32_t*)value) = node_id(); + break; + case HSA_AGENT_INFO_DEVICE: + *((hsa_device_type_t*)value) = HSA_DEVICE_TYPE_CPU; + break; + case HSA_AGENT_INFO_CACHE_SIZE: { + std::memset(value, 0, sizeof(uint32_t) * 4); + + assert(cache_props_.size() > 0 && "CPU cache info missing."); + const size_t num_cache = cache_props_.size(); + for (size_t i = 0; i < num_cache; ++i) { + const uint32_t line_level = cache_props_[i].CacheLevel; + ((uint32_t*)value)[line_level - 1] = cache_props_[i].CacheSize * 1024; + } + } break; + case HSA_AGENT_INFO_ISA: + ((hsa_isa_t*)value)->handle = 0; + break; + case HSA_AGENT_INFO_EXTENSIONS: + memset(value, 0, sizeof(uint8_t) * 128); + break; + case HSA_AGENT_INFO_VERSION_MAJOR: + *((uint16_t*)value) = 1; + break; + case HSA_AGENT_INFO_VERSION_MINOR: + *((uint16_t*)value) = 0; + break; + case HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS: + *((uint32_t*)value) = 0; + break; + case HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS: + memset(value, 0, sizeof(uint32_t) * 2); + break; + case HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS: + memset(value, 0, sizeof(uint32_t) * 3); + break; + case HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS: + *((uint32_t*)value) = 0; + break; + case HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES: + case 
HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES: + case HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS: + *((uint32_t*)value) = 0; + break; + case HSA_AMD_AGENT_INFO_CHIP_ID: + *((uint32_t*)value) = properties_.DeviceId; + break; + case HSA_AMD_AGENT_INFO_CACHELINE_SIZE: + // TODO: hardcode for now. + *((uint32_t*)value) = 64; + break; + case HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: + *((uint32_t*)value) = properties_.NumCPUCores; + break; + case HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY: + *((uint32_t*)value) = properties_.MaxEngineClockMhzCCompute; + break; + case HSA_AMD_AGENT_INFO_DRIVER_NODE_ID: + *((uint32_t*)value) = node_id(); + break; + case HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS: + *((uint32_t*)value) = static_cast( + 1 << properties_.Capability.ui32.WatchPointsTotalBits); + break; + case HSA_AMD_AGENT_INFO_BDFID: + *((uint32_t*)value) = static_cast(properties_.LocationId); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + break; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t CpuAgent::QueueCreate(size_t size, hsa_queue_type_t queue_type, + core::HsaEventCallback event_callback, + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, + core::Queue** queue) { + // No HW AQL packet processor on CPU device. + return HSA_STATUS_ERROR; +} + +} // namespace amd diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp new file mode 100644 index 0000000000..5962a0c1b2 --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -0,0 +1,863 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/amd_gpu_agent.h" + +#include +#include +#include +#include +#include + +#include "core/inc/amd_aql_queue.h" +#include "core/inc/amd_blit_kernel.h" +#include "core/inc/amd_blit_sdma.h" +#include "core/inc/amd_memory_region.h" +#include "core/inc/interrupt_signal.h" +#include "core/inc/isa.h" +#include "core/inc/runtime.h" + +#include "utils/sp3/sp3.h" + +#include "hsa_ext_image.h" + +// Size of scratch (private) segment pre-allocated per thread, in bytes. +#define DEFAULT_SCRATCH_BYTES_PER_THREAD 2048 + +namespace amd { +GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) + : GpuAgentInt(node), + properties_(node_props), + current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT), + blit_(NULL), + is_kv_device_(false), + trap_code_buf_(NULL), + trap_code_buf_size_(0), + ape1_base_(0), + ape1_size_(0) { + const bool is_apu_node = (properties_.NumCPUCores > 0); + profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE; + + HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id(), &t0_); + t1_ = t0_; + assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error"); + + // Set instruction set architecture via node property, only on GPU device. + isa_ = (core::Isa*)core::IsaRegistry::GetIsa(core::Isa::Version( + node_props.EngineId.ui32.Major, node_props.EngineId.ui32.Minor, + node_props.EngineId.ui32.Stepping)); + // Check if the device is Kaveri, only on GPU device. + if (isa_->GetMajorVersion() == 7 && isa_->GetMinorVersion() == 0 && + isa_->GetStepping() == 0) { + is_kv_device_ = true; + } + + current_coherency_type((profile_ == HSA_PROFILE_FULL) + ? 
HSA_AMD_COHERENCY_TYPE_COHERENT + : HSA_AMD_COHERENCY_TYPE_NONCOHERENT); + + max_queues_ = + static_cast(atoi(os::GetEnvVar("HSA_MAX_QUEUES").c_str())); +#if !defined(HSA_LARGE_MODEL) || !defined(__linux__) + if (max_queues_ == 0) { + max_queues_ = 10; + } + max_queues_ = std::min(10U, max_queues_); +#else + if (max_queues_ == 0) { + max_queues_ = 128; + } + max_queues_ = std::min(128U, max_queues_); +#endif + + // Populate region list. + InitRegionList(); + + // Reserve memory for scratch. + InitScratchPool(); + + // Populate cache list. + InitCacheList(); + + // Bind the second-level trap handler to this node. + BindTrapHandler(); +} + +GpuAgent::~GpuAgent() { + if (blit_ != NULL) { + hsa_status_t status = blit_->Destroy(); + assert(status == HSA_STATUS_SUCCESS); + + delete blit_; + blit_ = NULL; + } + + if (ape1_base_ != 0) { + _aligned_free(reinterpret_cast(ape1_base_)); + } + + if (scratch_pool_.base() != NULL) { + hsaKmtFreeMemory(scratch_pool_.base(), scratch_pool_.size()); + } + + if (trap_code_buf_ != NULL) { + ReleaseShader(trap_code_buf_, trap_code_buf_size_); + } + + std::for_each(regions_.begin(), regions_.end(), DeleteObject()); + regions_.clear(); +} + +void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name, + void*& code_buf, size_t& code_buf_size) { +#ifdef __linux__ // No VS builds of libsp3 available right now + // Assemble source string with libsp3. + sp3_context* sp3 = sp3_new(); + + switch (isa_->GetMajorVersion()) { + case 7: + sp3_setasic(sp3, "CI"); + break; + case 8: + sp3_setasic(sp3, "VI"); + break; + default: + assert(false && "SP3 assembly not supported on this agent"); + } + + sp3_parse_string(sp3, src_sp3); + sp3_shader* code_sp3_meta = sp3_compile(sp3, func_name); + + // Allocate a GPU-visible buffer for the trap shader. 
+ HsaMemFlags code_buf_flags = {0}; + code_buf_flags.ui32.HostAccess = 1; + code_buf_flags.ui32.ExecuteAccess = 1; + code_buf_flags.ui32.NoSubstitute = 1; + + size_t code_size = code_sp3_meta->size * sizeof(uint32_t); + code_buf_size = AlignUp(code_size, 0x1000); + + HSAKMT_STATUS err = + hsaKmtAllocMemory(node_id(), code_buf_size, code_buf_flags, &code_buf); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(Trap) failed"); + + err = hsaKmtMapMemoryToGPU(code_buf, code_buf_size, NULL); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(Trap) failed"); + + // Copy trap handler code into the GPU-visible buffer. + memset(code_buf, 0, code_buf_size); + memcpy(code_buf, code_sp3_meta->data, code_size); + + // Release SP3 resources. + sp3_free_shader(code_sp3_meta); + sp3_close(sp3); +#endif +} + +void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) { + hsaKmtUnmapMemoryToGPU(code_buf); + hsaKmtFreeMemory(code_buf, code_buf_size); +} + +void GpuAgent::InitRegionList() { + const bool is_apu_node = (properties_.NumCPUCores > 0); + + std::vector mem_props(properties_.NumMemoryBanks); + if (HSAKMT_STATUS_SUCCESS == + hsaKmtGetNodeMemoryProperties(node_id(), properties_.NumMemoryBanks, + &mem_props[0])) { + for (uint32_t mem_idx = 0; mem_idx < properties_.NumMemoryBanks; + ++mem_idx) { + // Ignore the one(s) with unknown size. 
+ if (mem_props[mem_idx].SizeInBytes == 0) { + continue; + } + + switch (mem_props[mem_idx].HeapType) { + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: + case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: + if (!is_apu_node) { + mem_props[mem_idx].VirtualBaseAddress = 0; + } + case HSA_HEAPTYPE_GPU_LDS: + case HSA_HEAPTYPE_GPU_SCRATCH: + case HSA_HEAPTYPE_DEVICE_SVM: { + MemoryRegion* region = + new MemoryRegion(false, false, this, mem_props[mem_idx]); + + regions_.push_back(region); + break; + } + default: + continue; + } + } + } +} + +void GpuAgent::InitScratchPool() { + HsaMemFlags flags; + flags.Value = 0; + flags.ui32.Scratch = 1; + flags.ui32.HostAccess = 1; + + scratch_per_thread_ = atoi(os::GetEnvVar("HSA_SCRATCH_MEM").c_str()); + if (scratch_per_thread_ == 0) + scratch_per_thread_ = DEFAULT_SCRATCH_BYTES_PER_THREAD; + + // Scratch length is: waves/CU * threads/wave * queues * #CUs * + // scratch/thread + const uint32_t num_cu = + properties_.NumFComputeCores / properties_.NumSIMDPerCU; + queue_scratch_len_ = 0; + queue_scratch_len_ = AlignUp(32 * 64 * num_cu * scratch_per_thread_, 65536); + size_t max_scratch_len = queue_scratch_len_ * max_queues_; + +#if defined(HSA_LARGE_MODEL) && defined(__linux__) + // For 64-bit linux use max queues unless otherwise specified + if ((max_scratch_len == 0) || (max_scratch_len > 4294967296)) { + max_scratch_len = 4294967296; // 4GB apeture max + } +#endif + + void* scratch_base; + HSAKMT_STATUS err = + hsaKmtAllocMemory(node_id(), max_scratch_len, flags, &scratch_base); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(Scratch) failed"); + assert(IsMultipleOf(scratch_base, 0x1000) && + "Scratch base is not page aligned!"); + + scratch_pool_. ~SmallHeap(); + if (HSAKMT_STATUS_SUCCESS == err) { + new (&scratch_pool_) SmallHeap(scratch_base, max_scratch_len); + } else { + new (&scratch_pool_) SmallHeap(NULL, 0); + } +} + +void GpuAgent::InitCacheList() { + // Get GPU cache information. 
+ // Similar to getting CPU cache but here we use FComputeIdLo. + cache_props_.resize(properties_.NumCaches); + if (HSAKMT_STATUS_SUCCESS != + hsaKmtGetNodeCacheProperties(node_id(), properties_.FComputeIdLo, + properties_.NumCaches, &cache_props_[0])) { + cache_props_.clear(); + } else { + // Only store GPU D-cache. + for (size_t cache_id = 0; cache_id < cache_props_.size(); ++cache_id) { + const HsaCacheType type = cache_props_[cache_id].CacheType; + if (type.ui32.HSACU != 1 || type.ui32.Instruction == 1) { + cache_props_.erase(cache_props_.begin() + cache_id); + --cache_id; + } + } + } +} + +hsa_status_t GpuAgent::IterateRegion( + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const { + return VisitRegion(true, callback, data); +} + +hsa_status_t GpuAgent::VisitRegion(bool include_peer, + hsa_status_t (*callback)(hsa_region_t region, + void* data), + void* data) const { + if (include_peer) { + // Only expose system, local, and LDS memory of the blit agent. + if (this->node_id() == + core::Runtime::runtime_singleton_->blit_agent()->node_id()) { + hsa_status_t stat = VisitRegion(regions_, callback, data); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + } + + // Also expose system regions accessible by this agent. + hsa_status_t stat = + VisitRegion(core::Runtime::runtime_singleton_->system_regions_fine(), + callback, data); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + + return VisitRegion( + core::Runtime::runtime_singleton_->system_regions_coarse(), callback, + data); + } + + // Only expose system, local, and LDS memory of this agent. + return VisitRegion(regions_, callback, data); +} + +hsa_status_t GpuAgent::VisitRegion( + const std::vector& regions, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data) const { + for (const core::MemoryRegion* region : regions) { + const amd::MemoryRegion* amd_region = + reinterpret_cast(region); + + // Only expose system, local, and LDS memory. 
+ if (amd_region->IsSystem() || amd_region->IsLocalMemory() || + amd_region->IsLDS()) { + hsa_region_t region_handle = core::MemoryRegion::Convert(region); + hsa_status_t status = callback(region_handle, data); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::InitDma() { + // Try create SDMA blit first. + std::string sdma_enable = os::GetEnvVar("HSA_ENABLE_SDMA"); + + if (sdma_enable != "0" && isa_->GetMajorVersion() == 8 && + isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 3) { + blit_ = new BlitSdma(); + + if (blit_->Initialize(*this) == HSA_STATUS_SUCCESS) { + return HSA_STATUS_SUCCESS; + } + + // Fall back to blit kernel if SDMA is unavailable. + blit_->Destroy(); + delete blit_; + blit_ = NULL; + } + + assert(blit_ == NULL); + blit_ = new BlitKernel(); + + if (blit_->Initialize(*this) != HSA_STATUS_SUCCESS) { + blit_->Destroy(); + delete blit_; + blit_ = NULL; + + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) { + if (blit_ == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return blit_->SubmitLinearCopyCommand(dst, src, size); +} + +hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size, + std::vector& dep_signals, + core::Signal& out_signal) { + if (blit_ == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + // TODO(bwicakso): temporarily disable wait on thunk event if the out_signal + // is an interrupt signal object. Remove this when SDMA handle interrupt + // packet properly. 
+ if (out_signal.EopEvent() != NULL) { + static_cast(out_signal).DisableWaitEvent(); + } + + return blit_->SubmitLinearCopyCommand(dst, src, size, dep_signals, + out_signal); +} + +hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) { + if (blit_ == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return blit_->SubmitLinearFillCommand(ptr, value, count); +} + +hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { + const size_t kNameSize = 64; // agent, and vendor name size limit + + const core::ExtensionEntryPoints& extensions = + core::Runtime::runtime_singleton_->extensions_; + + hsa_agent_t agent = core::Agent::Convert(this); + + const size_t attribute_u = static_cast(attribute); + switch (attribute_u) { + case HSA_AGENT_INFO_NAME: + // TODO(bwicakso): hardcode for now. + std::memset(value, 0, kNameSize); + if (isa_->GetMajorVersion() == 7) { + std::memcpy(value, "Kaveri", sizeof("Kaveri")); + } else if (isa_->GetMajorVersion() == 8) { + if (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 2) { + std::memcpy(value, "Tonga", sizeof("Tonga")); + } else if (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 3) { + std::memcpy(value, "Fiji", sizeof("Fiji")); + } else { + std::memcpy(value, "Carrizo", sizeof("Carrizo")); + } + } else { + std::memcpy(value, "Unknown", sizeof("Unknown")); + } + break; + case HSA_AGENT_INFO_VENDOR_NAME: + std::memset(value, 0, kNameSize); + std::memcpy(value, "AMD", sizeof("AMD")); + break; + case HSA_AGENT_INFO_FEATURE: + *((hsa_agent_feature_t*)value) = HSA_AGENT_FEATURE_KERNEL_DISPATCH; + break; + case HSA_AGENT_INFO_MACHINE_MODEL: +#if defined(HSA_LARGE_MODEL) + *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_LARGE; +#else + *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_SMALL; +#endif + break; + case HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES: + case HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE: + *((hsa_default_float_rounding_mode_t*)value) = + 
HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR; + break; + case HSA_AGENT_INFO_FAST_F16_OPERATION: + *((bool*)value) = false; + break; + case HSA_AGENT_INFO_PROFILE: + *((hsa_profile_t*)value) = profile_; + break; + case HSA_AGENT_INFO_WAVEFRONT_SIZE: + *((uint32_t*)value) = properties_.WaveFrontSize; + break; + case HSA_AGENT_INFO_WORKGROUP_MAX_DIM: { + // TODO: must be per-device + const uint16_t group_size[3] = {1024, 1024, 1024}; + std::memcpy(value, group_size, sizeof(group_size)); + } break; + case HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: + // TODO: must be per-device + *((uint32_t*)value) = 1024; + break; + case HSA_AGENT_INFO_GRID_MAX_DIM: { + const hsa_dim3_t grid_size = {UINT32_MAX, UINT32_MAX, UINT32_MAX}; + std::memcpy(value, &grid_size, sizeof(hsa_dim3_t)); + } break; + case HSA_AGENT_INFO_GRID_MAX_SIZE: + *((uint32_t*)value) = UINT32_MAX; + break; + case HSA_AGENT_INFO_FBARRIER_MAX_SIZE: + // TODO: to confirm + *((uint32_t*)value) = 32; + break; + case HSA_AGENT_INFO_QUEUES_MAX: + *((uint32_t*)value) = max_queues_; + break; + case HSA_AGENT_INFO_QUEUE_MIN_SIZE: + *((uint32_t*)value) = minAqlSize_; + break; + case HSA_AGENT_INFO_QUEUE_MAX_SIZE: + *((uint32_t*)value) = maxAqlSize_; + break; + case HSA_AGENT_INFO_QUEUE_TYPE: + *((hsa_queue_type_t*)value) = HSA_QUEUE_TYPE_MULTI; + break; + case HSA_AGENT_INFO_NODE: + // TODO: associate with OS NUMA support (numactl / GetNumaProcessorNode). + *((uint32_t*)value) = node_id(); + break; + case HSA_AGENT_INFO_DEVICE: + *((hsa_device_type_t*)value) = HSA_DEVICE_TYPE_GPU; + break; + case HSA_AGENT_INFO_CACHE_SIZE: + std::memset(value, 0, sizeof(uint32_t) * 4); + // TODO: no GPU cache info from KFD. Hardcode for now. + // GCN whitepaper: L1 data cache is 16KB. 
+ ((uint32_t*)value)[0] = 16 * 1024; + break; + case HSA_AGENT_INFO_ISA: + *((hsa_isa_t*)value) = core::Isa::Handle(isa_); + break; + case HSA_AGENT_INFO_EXTENSIONS: + memset(value, 0, sizeof(uint8_t) * 128); + + if (extensions.table.hsa_ext_program_finalize_fn != NULL) { + *((uint8_t*)value) = 1 << HSA_EXTENSION_FINALIZER; + } + + if (profile_ == HSA_PROFILE_FULL && + extensions.table.hsa_ext_image_create_fn != NULL) { + // TODO(bwicakso): only APU supports images currently. + *((uint8_t*)value) |= 1 << HSA_EXTENSION_IMAGES; + } + + *((uint8_t*)value) |= 1 << HSA_EXTENSION_AMD_PROFILER; + + break; + case HSA_AGENT_INFO_VERSION_MAJOR: + *((uint16_t*)value) = 1; + break; + case HSA_AGENT_INFO_VERSION_MINOR: + *((uint16_t*)value) = 0; + break; + case HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS: + case HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS: + return hsa_amd_image_get_info_max_dim(public_handle(), attribute, value); + case HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES: + // TODO: hardcode based on OCL constants. + *((uint32_t*)value) = 128; + break; + case HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES: + // TODO: hardcode based on OCL constants. + *((uint32_t*)value) = 64; + break; + case HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS: + // TODO: hardcode based on OCL constants. + *((uint32_t*)value) = 16; + case HSA_AMD_AGENT_INFO_CHIP_ID: + *((uint32_t*)value) = properties_.DeviceId; + break; + case HSA_AMD_AGENT_INFO_CACHELINE_SIZE: + // TODO: hardcode for now. + // GCN whitepaper: cache line size is 64 byte long. 
+ *((uint32_t*)value) = 64; + break; + case HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: + *((uint32_t*)value) = + (properties_.NumFComputeCores / properties_.NumSIMDPerCU); + break; + case HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY: + *((uint32_t*)value) = properties_.MaxEngineClockMhzFCompute; + break; + case HSA_AMD_AGENT_INFO_DRIVER_NODE_ID: + *((uint32_t*)value) = node_id(); + break; + case HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS: + *((uint32_t*)value) = static_cast( + 1 << properties_.Capability.ui32.WatchPointsTotalBits); + break; + case HSA_AMD_AGENT_INFO_BDFID: + *((uint32_t*)value) = static_cast(properties_.LocationId); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + break; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type_t queue_type, + core::HsaEventCallback event_callback, + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, + core::Queue** queue) { + // AQL queues must be a power of two in length. + if (!IsPowerOfTwo(size)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + // Enforce max size + if (size > maxAqlSize_) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + // Allocate scratch memory + ScratchInfo scratch; +#if defined(HSA_LARGE_MODEL) && defined(__linux__) + if (core::g_use_interrupt_wait) { + if (private_segment_size == UINT_MAX) { + private_segment_size = + (profile_ == HSA_PROFILE_BASE) ? 
0 : scratch_per_thread_; + } + + if (private_segment_size > 262128) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + scratch.size_per_thread = AlignUp(private_segment_size, 16); + if (scratch.size_per_thread > 262128) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + const uint32_t num_cu = + properties_.NumFComputeCores / properties_.NumSIMDPerCU; + scratch.size = scratch.size_per_thread * 32 * 64 * num_cu; + } else { + scratch.size = queue_scratch_len_; + scratch.size_per_thread = scratch_per_thread_; + } +#else + scratch.size = queue_scratch_len_; + scratch.size_per_thread = scratch_per_thread_; +#endif + scratch.queue_base = NULL; + if (scratch.size != 0) { + AcquireQueueScratch(scratch); + if (scratch.queue_base == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + } + + // Create an HW AQL queue + AqlQueue* hw_queue = new AqlQueue(this, size, node_id(), scratch, + event_callback, data, is_kv_device_); + if (hw_queue && hw_queue->IsValid()) { + // return queue + *queue = hw_queue; + return HSA_STATUS_SUCCESS; + } + // If reached here its always an ERROR. 
+ delete hw_queue; + ReleaseQueueScratch(scratch.queue_base); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; +} + +void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { + if (scratch.size == 0) { + scratch.size = queue_scratch_len_; + scratch.size_per_thread = scratch_per_thread_; + } + + ScopedAcquire lock(&scratch_lock_); + scratch.queue_base = scratch_pool_.alloc(scratch.size); + scratch.queue_process_offset = + uintptr_t(scratch.queue_base) - uintptr_t(scratch_pool_.base()); + + if ((scratch.queue_base != NULL) && (profile_ == HSA_PROFILE_BASE)) { + HSAuint64 alternate_va; + if (HSAKMT_STATUS_SUCCESS != + hsaKmtMapMemoryToGPU(scratch.queue_base, scratch.size, &alternate_va)) { + assert(false && "Map scratch subrange failed!"); + scratch_pool_.free(scratch.queue_base); + scratch.queue_base = NULL; + } + } +} + +void GpuAgent::ReleaseQueueScratch(void* base) { + if (base == NULL) { + return; + } + + ScopedAcquire lock(&scratch_lock_); + if (profile_ == HSA_PROFILE_BASE) { + if (HSAKMT_STATUS_SUCCESS != hsaKmtUnmapMemoryToGPU(base)) { + assert(false && "Unmap scratch subrange failed!"); + } + } + scratch_pool_.free(base); +} + +void GpuAgent::TranslateTime(core::Signal* signal, + hsa_amd_profiling_dispatch_time_t& time) { + // Ensure interpolation + ScopedAcquire lock(&t1_lock_); + if (t1_.GPUClockCounter < signal->signal_.end_ts) { + SyncClocks(); + } + + time.start = uint64_t( + (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) / + double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) * + double(int64_t(signal->signal_.start_ts - t1_.GPUClockCounter)) + + double(t1_.SystemClockCounter)); + time.end = uint64_t( + (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) / + double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) * + double(int64_t(signal->signal_.end_ts - t1_.GPUClockCounter)) + + double(t1_.SystemClockCounter)); +} + +uint64_t GpuAgent::TranslateTime(uint64_t tick) { + ScopedAcquire lock(&t1_lock_); + 
SyncClocks(); + + uint64_t system_tick = 0; + system_tick = uint64_t( + (double(int64_t(t0_.SystemClockCounter - t1_.SystemClockCounter)) / + double(int64_t(t0_.GPUClockCounter - t1_.GPUClockCounter))) * + double(int64_t(tick - t1_.GPUClockCounter)) + + double(t1_.SystemClockCounter)); + return system_tick; +} + +bool GpuAgent::current_coherency_type(hsa_amd_coherency_type_t type) { + if (!is_kv_device_) { + current_coherency_type_ = type; + return true; + } + + ScopedAcquire Lock(&coherency_lock_); + + if (ape1_base_ == 0 && ape1_size_ == 0) { + static const size_t kApe1Alignment = 64 * 1024; + ape1_size_ = kApe1Alignment; + ape1_base_ = reinterpret_cast( + _aligned_malloc(ape1_size_, kApe1Alignment)); + assert((ape1_base_ != 0) && ("APE1 allocation failed")); + } else if (type == current_coherency_type_) { + return true; + } + + HSA_CACHING_TYPE type0, type1; + if (type == HSA_AMD_COHERENCY_TYPE_COHERENT) { + type0 = HSA_CACHING_CACHED; + type1 = HSA_CACHING_NONCACHED; + } else { + type0 = HSA_CACHING_NONCACHED; + type1 = HSA_CACHING_CACHED; + } + + if (hsaKmtSetMemoryPolicy(node_id(), type0, type1, + reinterpret_cast(ape1_base_), + ape1_size_) != HSAKMT_STATUS_SUCCESS) { + return false; + } + current_coherency_type_ = type; + return true; +} + +uint16_t GpuAgent::GetMicrocodeVersion() const { + return (properties_.EngineId.ui32.uCode); +} + +void GpuAgent::SyncClocks() { + HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id(), &t1_); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaGetClockCounters error"); +} + +void GpuAgent::BindTrapHandler() { +#ifdef __linux__ // No raw string literal support in VS builds right now + const char* src_sp3 = R"( + var s_trap_info_lo = ttmp0 + var s_trap_info_hi = ttmp1 + var s_tmp0 = ttmp2 + var s_tmp1 = ttmp3 + var s_tmp2 = ttmp4 + var s_tmp3 = ttmp5 + + shader TrapHandler + type(CS) + + // Retrieve the queue inactive signal. 
+ s_load_dwordx2 [s_tmp0, s_tmp1], s[0:1], 0xC0 + s_waitcnt lgkmcnt(0) + + // Mask all but one lane of the wavefront. + s_mov_b64 exec, 0x1 + + // Set queue signal value to unhandled exception error. + s_add_u32 s_tmp0, s_tmp0, 0x8 + s_addc_u32 s_tmp1, s_tmp1, 0x0 + v_mov_b32 v0, s_tmp0 + v_mov_b32 v1, s_tmp1 + v_mov_b32 v2, 0x80000000 + v_mov_b32 v3, 0x0 + flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] + s_waitcnt vmcnt(0) + + // Skip event if the signal was already set to unhandled exception. + v_cmp_eq_u64 vcc, v[0:1], v[2:3] + s_cbranch_vccnz L_SIGNAL_DONE + + // Check for a non-NULL signal event mailbox. + s_load_dwordx2 [s_tmp2, s_tmp3], [s_tmp0, s_tmp1], 0x8 + s_waitcnt lgkmcnt(0) + s_and_b64 [s_tmp2, s_tmp3], [s_tmp2, s_tmp3], [s_tmp2, s_tmp3] + s_cbranch_scc0 L_SIGNAL_DONE + + // Load the signal event value. + s_add_u32 s_tmp0, s_tmp0, 0x10 + s_addc_u32 s_tmp1, s_tmp1, 0x0 + s_load_dword s_tmp0, [s_tmp0, s_tmp1], 0x0 + s_waitcnt lgkmcnt(0) + + // Write the signal event value to the mailbox. + v_mov_b32 v0, s_tmp2 + v_mov_b32 v1, s_tmp3 + v_mov_b32 v2, s_tmp0 + flat_store_dword v[0:1], v2 + s_waitcnt vmcnt(0) + + // Send an interrupt to trigger event notification. + s_sendmsg sendmsg(MSG_INTERRUPT) + + L_SIGNAL_DONE: + // Halt wavefront and exit trap. + s_sethalt 1 + s_rfe_b64 [s_trap_info_lo, s_trap_info_hi] + end + )"; + + if (isa_->GetMajorVersion() == 7) { + // No trap handler support on Gfx7, soft error. + return; + } + + // Disable trap handler on Carrizo until KFD is fixed. + if (profile_ == HSA_PROFILE_FULL) { + return; + } + + // Assemble the trap handler source code. + AssembleShader(src_sp3, "TrapHandler", trap_code_buf_, trap_code_buf_size_); + + // Bind the trap handler to this node. 
+ HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_, + trap_code_buf_size_, NULL, 0); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtSetTrapHandler() failed"); +#endif +} + +} // namespace diff --git a/runtime/hsa-runtime/core/runtime/amd_load_map.cpp b/runtime/hsa-runtime/core/runtime/amd_load_map.cpp new file mode 100644 index 0000000000..a3f9ac9c9a --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_load_map.cpp @@ -0,0 +1,172 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "core/inc/amd_hsa_loader.hpp" +#include "core/inc/amd_load_map.h" +#include "core/inc/runtime.h" + +using amd::hsa::loader::Executable; +using amd::hsa::loader::LoadedCodeObject; +using amd::hsa::loader::LoadedSegment; + +hsa_status_t amd_executable_load_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + amd_loaded_code_object_t *loaded_code_object) +{ + if (!core::Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if (nullptr == loaded_code_object) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + Executable *exec = Executable::Object(executable); + if (nullptr == exec) { + return HSA_STATUS_ERROR_INVALID_EXECUTABLE; + } + return exec->LoadCodeObject(agent, code_object, options, loaded_code_object); +} + +hsa_status_t amd_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data) +{ + if (!core::Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if (nullptr == callback) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + return core::Runtime::runtime_singleton_->loader()->IterateExecutables(callback, data); +} + +hsa_status_t amd_executable_iterate_loaded_code_objects( + hsa_executable_t executable, + hsa_status_t (*callback)( + amd_loaded_code_object_t 
loaded_code_object, + void *data), + void *data) +{ + if (!core::Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if (nullptr == callback) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + Executable *exec = Executable::Object(executable); + if (nullptr == exec) { + return HSA_STATUS_ERROR_INVALID_EXECUTABLE; + } + return exec->IterateLoadedCodeObjects(callback, data); +} + +hsa_status_t amd_loaded_code_object_get_info( + amd_loaded_code_object_t loaded_code_object, + amd_loaded_code_object_info_t attribute, + void *value) +{ + if (!core::Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if (nullptr == value) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + LoadedCodeObject *obj = LoadedCodeObject::Object(loaded_code_object); + if (nullptr == obj) { + // \todo: new error code: AMD_STATUS_ERROR_INVALID_LOADED_CODE_OBJECT. + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return false == obj->GetInfo(attribute, value) ? + HSA_STATUS_ERROR_INVALID_ARGUMENT : HSA_STATUS_SUCCESS; +} + +hsa_status_t amd_loaded_code_object_iterate_loaded_segments( + amd_loaded_code_object_t loaded_code_object, + hsa_status_t (*callback)( + amd_loaded_segment_t loaded_segment, + void *data), + void *data) +{ + if (!core::Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if (nullptr == callback) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + LoadedCodeObject *obj = LoadedCodeObject::Object(loaded_code_object); + if (nullptr == obj) { + // \todo: new error code: AMD_STATUS_ERROR_INVALID_LOADED_CODE_OBJECT. 
+ return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return obj->IterateLoadedSegments(callback, data); +} + +hsa_status_t amd_loaded_segment_get_info( + amd_loaded_segment_t loaded_segment, + amd_loaded_segment_info_t attribute, + void *value) +{ + if (!core::Runtime::runtime_singleton_->IsOpen()) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; + } + if (nullptr == value) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + LoadedSegment *obj = LoadedSegment::Object(loaded_segment); + if (nullptr == obj) { + // \todo: new error code: AMD_STATUS_ERROR_INVALID_LOADED_SEGMENT. + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return false == obj->GetInfo(attribute, value) ? + HSA_STATUS_ERROR_INVALID_ARGUMENT : HSA_STATUS_SUCCESS; +} diff --git a/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp b/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp new file mode 100644 index 0000000000..ec7a91720b --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_loader_context.cpp @@ -0,0 +1,588 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/amd_loader_context.hpp" + +#include +#include +#include + +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/amd_memory_region.h" +#include "core/util/os.h" + +#include +#include +#include "core/inc/hsa_internal.h" +#include "core/util/utils.h" +#include "inc/hsa_ext_amd.h" + +#if defined(_WIN32) || defined(_WIN64) +#include +#else +#include +#endif + +namespace { + +bool IsLocalRegion(const core::MemoryRegion *region) +{ + const amd::MemoryRegion *amd_region = (amd::MemoryRegion*)region; + if (nullptr == amd_region || !amd_region->IsLocalMemory()) { + return false; + } + return true; +} + +bool IsDebuggerRegistered() +{ + return false; + // Leaving code commented as it will be used later on + // return (("1" == os::GetEnvVar("HSA_EMULATE_AQL")) && + // (0 != os::GetEnvVar("HSA_TOOLS_LIB").size())); +} + +class SegmentMemory { +public: + virtual ~SegmentMemory() {} + virtual void* Address(size_t offset = 0) const = 0; + virtual void* 
HostAddress(size_t offset = 0) const = 0; + virtual bool Allocated() const = 0; + virtual bool Allocate(size_t size, size_t align, bool zero) = 0; + virtual bool Copy(size_t offset, const void *src, size_t size) = 0; + virtual void Free() = 0; + virtual bool Freeze() = 0; + +protected: + SegmentMemory() {} + +private: + SegmentMemory(const SegmentMemory&); + SegmentMemory& operator=(const SegmentMemory&); +}; + +class MallocedMemory final: public SegmentMemory { +public: + MallocedMemory(): SegmentMemory(), ptr_(nullptr), size_(0) {} + ~MallocedMemory() {} + + void* Address(size_t offset = 0) const override + { assert(this->Allocated()); return (char*)ptr_ + offset; } + void* HostAddress(size_t offset = 0) const override + { assert(false); return nullptr; } + bool Allocated() const override + { return nullptr != ptr_; } + + bool Allocate(size_t size, size_t align, bool zero) override; + bool Copy(size_t offset, const void *src, size_t size) override; + void Free() override; + bool Freeze() override; + +private: + MallocedMemory(const MallocedMemory&); + MallocedMemory& operator=(const MallocedMemory&); + + void *ptr_; + size_t size_; +}; + +bool MallocedMemory::Allocate(size_t size, size_t align, bool zero) +{ + assert(!this->Allocated()); + assert(0 < size); + assert(0 < align && 0 == (align & (align - 1))); + ptr_ = _aligned_malloc(size, align); + if (nullptr == ptr_) { + return false; + } + if (HSA_STATUS_SUCCESS != HSA::hsa_memory_register(ptr_, size)) { + _aligned_free(ptr_); + ptr_ = nullptr; + return false; + } + if (zero) { + memset(ptr_, 0x0, size); + } + size_ = size; + return true; +} + +bool MallocedMemory::Copy(size_t offset, const void *src, size_t size) +{ + assert(this->Allocated()); + assert(nullptr != src); + assert(0 < size); + memcpy(this->Address(offset), src, size); + return true; +} + +void MallocedMemory::Free() +{ + assert(this->Allocated()); + HSA::hsa_memory_deregister(ptr_, size_); + _aligned_free(ptr_); + ptr_ = nullptr; + size_ = 0; +} 
+
+bool MallocedMemory::Freeze()
+{
+  assert(this->Allocated());
+  return true;
+}
+
+class MappedMemory final: public SegmentMemory {
+public:
+  MappedMemory(bool is_kv = false): SegmentMemory(), is_kv_(is_kv), ptr_(nullptr), size_(0) {}
+  ~MappedMemory() {}
+
+  void* Address(size_t offset = 0) const override
+    { assert(this->Allocated()); return (char*)ptr_ + offset; }
+  void* HostAddress(size_t offset = 0) const override
+    { assert(false); return nullptr; }
+  bool Allocated() const override
+    { return nullptr != ptr_; }
+
+  bool Allocate(size_t size, size_t align, bool zero) override;
+  bool Copy(size_t offset, const void *src, size_t size) override;
+  void Free() override;
+  bool Freeze() override;
+
+private:
+  MappedMemory(const MappedMemory&);
+  MappedMemory& operator=(const MappedMemory&);
+
+  bool is_kv_;
+  void *ptr_;
+  size_t size_;
+};
+
+// Maps `size` bytes of anonymous memory and registers the range with the HSA
+// runtime.
+//
+// On Windows the range is committed read/write/execute; elsewhere it is an
+// anonymous private mmap that is executable unless is_kv_ is set.
+//
+// @param size   Number of bytes to map; must be non-zero.
+// @param align  Required alignment (power of two); only asserted, the OS
+//               mapping granularity is relied upon to satisfy it.
+// @param zero   When true the mapped range is cleared with memset.
+// @return true on success; false if mapping or registration failed, in which
+//         case the object stays unallocated (ptr_ == nullptr).
+bool MappedMemory::Allocate(size_t size, size_t align, bool zero)
+{
+  assert(!this->Allocated());
+  assert(0 < size);
+  assert(0 < align && 0 == (align & (align - 1)));
+#if defined(_WIN32) || defined(_WIN64)
+  ptr_ = (void*)VirtualAlloc(nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+#else
+  ptr_ = is_kv_ ?
+    mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0) :
+    mmap(nullptr, size, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+  if (MAP_FAILED == ptr_) {
+    // mmap reports failure with MAP_FAILED rather than a null pointer;
+    // normalize it so the check below and Allocated() see the failure.
+    ptr_ = nullptr;
+  }
+#endif // _WIN32 || _WIN64
+  if (nullptr == ptr_) {
+    return false;
+  }
+  assert(0 == ((uintptr_t)ptr_) % align);
+  if (HSA_STATUS_SUCCESS != HSA::hsa_memory_register(ptr_, size)) {
+    // Registration failed: give the mapping back before reporting failure.
+#if defined(_WIN32) || defined(_WIN64)
+    VirtualFree(ptr_, size, MEM_DECOMMIT);
+    VirtualFree(ptr_, 0, MEM_RELEASE);
+#else
+    munmap(ptr_, size);
+#endif // _WIN32 || _WIN64
+    ptr_ = nullptr;
+    return false;
+  }
+  if (zero) {
+    memset(ptr_, 0x0, size);
+  }
+  size_ = size;
+  return true;
+}
+
+bool MappedMemory::Copy(size_t offset, const void *src, size_t size)
+{
+  assert(this->Allocated());
+  assert(nullptr != src);
+  assert(0 < size);
+  memcpy(this->Address(offset), src, size);
+  return true;
+}
+
+void MappedMemory::Free()
+{
+  assert(this->Allocated());
+  HSA::hsa_memory_deregister(ptr_, size_);
+#if defined(_WIN32) || defined(_WIN64)
+  VirtualFree(ptr_, size_, MEM_DECOMMIT);
+  VirtualFree(ptr_, 0, MEM_RELEASE);
+#else
+  munmap(ptr_, size_);
+#endif // _WIN32 || _WIN64
+  ptr_ = nullptr;
+  size_ = 0;
+}
+
+bool MappedMemory::Freeze()
+{
+  assert(this->Allocated());
+  return true;
+}
+
+class RegionMemory final: public SegmentMemory {
+public:
+  static hsa_region_t AgentLocal(hsa_agent_t agent);
+  static hsa_region_t System();
+
+  RegionMemory(hsa_region_t region): SegmentMemory(), region_(region), ptr_(nullptr), host_ptr_(nullptr), size_(0) {}
+  ~RegionMemory() {}
+
+  void* Address(size_t offset = 0) const override
+    { assert(this->Allocated()); return (char*)ptr_ + offset; }
+  void* HostAddress(size_t offset = 0) const override
+    { assert(this->Allocated()); return (char*)host_ptr_ + offset; }
+  bool Allocated() const override
+    { return nullptr != ptr_; }
+
+  bool Allocate(size_t size, size_t align, bool zero) override;
+  bool Copy(size_t offset, const void *src, size_t size) override;
+  void Free()
override; + bool Freeze() override; + +private: + RegionMemory(const RegionMemory&); + RegionMemory& operator=(const RegionMemory&); + + hsa_region_t region_; + void *ptr_; + void *host_ptr_; + size_t size_; +}; + +hsa_region_t RegionMemory::AgentLocal(hsa_agent_t agent) +{ + hsa_region_t invalid_region; invalid_region.handle = 0; + amd::GpuAgent *amd_agent = (amd::GpuAgent*)core::Agent::Convert(agent); + if (nullptr == amd_agent) { + return invalid_region; + } + auto agent_local_region = std::find_if(amd_agent->regions().begin(), amd_agent->regions().end(), IsLocalRegion); + return agent_local_region == amd_agent->regions().end() ? + invalid_region : core::MemoryRegion::Convert(*agent_local_region); +} + +hsa_region_t RegionMemory::System() { + const core::MemoryRegion* default_system_region = + core::Runtime::runtime_singleton_->system_regions_fine()[0]; + + assert(default_system_region != NULL); + + return core::MemoryRegion::Convert(default_system_region); +} + +bool RegionMemory::Allocate(size_t size, size_t align, bool zero) +{ + assert(!this->Allocated()); + assert(0 < size); + assert(0 < align && 0 == (align & (align - 1))); + if (HSA_STATUS_SUCCESS != HSA::hsa_memory_allocate(region_, size, &ptr_)) { + ptr_ = nullptr; + return false; + } + assert(0 == ((uintptr_t)ptr_) % align); + if (HSA_STATUS_SUCCESS != HSA::hsa_memory_allocate(RegionMemory::System(), size, &host_ptr_)) { + HSA::hsa_memory_free(ptr_); + ptr_ = nullptr; + host_ptr_ = nullptr; + return false; + } + if (zero) { + memset(host_ptr_, 0x0, size); + } + size_ = size; + return true; +} + +bool RegionMemory::Copy(size_t offset, const void *src, size_t size) +{ + assert(this->Allocated() && nullptr != host_ptr_); + assert(nullptr != src); + assert(0 < size); + memcpy((char*)host_ptr_ + offset, src, size); + return true; +} + +void RegionMemory::Free() +{ + assert(this->Allocated()); + HSA::hsa_memory_free(ptr_); + if (nullptr != host_ptr_) { + HSA::hsa_memory_free(host_ptr_); + } + ptr_ = nullptr; 
+  host_ptr_ = nullptr;
+  size_ = 0;
+}
+
+bool RegionMemory::Freeze() {
+  assert(this->Allocated() && nullptr != host_ptr_);
+
+  core::Agent* agent = reinterpret_cast(
+      core::MemoryRegion::Convert(region_))->owner();
+  if (agent != NULL && agent->device_type() == core::Agent::kAmdGpuDevice) {
+    if (HSA_STATUS_SUCCESS != agent->DmaCopy(ptr_, host_ptr_, size_)) {
+      return false;
+    }
+  } else {
+    memcpy(ptr_, host_ptr_, size_);
+  }
+
+  return true;
+}
+
+} // namespace anonymous
+
+namespace amd {
+
+hsa_isa_t LoaderContext::IsaFromName(const char *name) {
+  assert(name);
+
+  hsa_status_t hsa_status = HSA_STATUS_SUCCESS;
+  hsa_isa_t isa_handle;
+  isa_handle.handle = 0;
+
+  hsa_status = HSA::hsa_isa_from_name(name, &isa_handle);
+  if (HSA_STATUS_SUCCESS != hsa_status) {
+    isa_handle.handle = 0;
+    return isa_handle;
+  }
+
+  return isa_handle;
+}
+
+bool LoaderContext::IsaSupportedByAgent(hsa_agent_t agent,
+                                        hsa_isa_t code_object_isa) {
+  assert(agent.handle);
+
+  hsa_status_t hsa_status = HSA_STATUS_SUCCESS;
+  hsa_isa_t agent_isa;
+  agent_isa.handle = 0;
+
+  hsa_status = HSA::hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &agent_isa);
+  if (HSA_STATUS_SUCCESS != hsa_status) {
+    return false;
+  }
+
+  bool result = false;
+
+  hsa_status = HSA::hsa_isa_compatible(code_object_isa, agent_isa, &result);
+  if (HSA_STATUS_SUCCESS != hsa_status) {
+    return false;
+  }
+
+  return result;
+}
+
+// Creates and allocates the backing store for one loaded ELF segment.
+//
+// The SegmentMemory implementation is chosen from the segment kind and the
+// agent's profile:
+//   - GLOBAL_AGENT / READONLY_AGENT: agent-local region (base profile) or
+//     system region (full profile).
+//   - GLOBAL_PROGRAM: system region (base profile) or malloc'ed memory
+//     (full profile).
+//   - CODE_AGENT: agent-local region, or the system region when a debugger
+//     is registered (base profile); mapped memory (full profile).
+//
+// @param segment  Segment kind being loaded.
+// @param agent    Agent the segment is loaded for.
+// @param size     Segment size in bytes; must be non-zero.
+// @param align    Required alignment; must be a power of two.
+// @param zero     Whether the backing store must be zero-filled.
+// @return Opaque SegmentMemory handle to pass to the other Segment* calls,
+//         or nullptr if the profile query, object creation or the memory
+//         allocation failed.
+void* LoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+                                  hsa_agent_t agent,
+                                  size_t size,
+                                  size_t align,
+                                  bool zero)
+{
+  assert(0 < size);
+  assert(0 < align && 0 == (align & (align - 1)));
+  hsa_profile_t agent_profile;
+  if (HSA_STATUS_SUCCESS != HSA::hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile)) {
+    return nullptr;
+  }
+  SegmentMemory *mem = nullptr;
+  switch (segment) {
+  case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+  case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+    switch (agent_profile) {
+    case HSA_PROFILE_BASE:
+      mem = new (std::nothrow) RegionMemory(RegionMemory::AgentLocal(agent));
+      break;
+    case HSA_PROFILE_FULL:
+      mem = new (std::nothrow) RegionMemory(RegionMemory::System());
+      break;
+    default:
+      assert(false);
+    }
+    break;
+  case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    switch (agent_profile) {
+    case HSA_PROFILE_BASE:
+      mem = new (std::nothrow) RegionMemory(RegionMemory::System());
+      break;
+    case HSA_PROFILE_FULL:
+      mem = new (std::nothrow) MallocedMemory();
+      break;
+    default:
+      assert(false);
+    }
+    break;
+  case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+    switch (agent_profile) {
+    case HSA_PROFILE_BASE:
+      mem = new (std::nothrow) RegionMemory(IsDebuggerRegistered() ?
+                                            RegionMemory::System() :
+                                            RegionMemory::AgentLocal(agent));
+      break;
+    case HSA_PROFILE_FULL:
+      mem = new (std::nothrow) MappedMemory(((GpuAgentInt*)core::Agent::Convert(agent))->is_kv_device());
+      break;
+    default:
+      assert(false);
+    }
+    break;
+  default:
+    assert(false);
+  }
+  if (nullptr == mem) {
+    return nullptr;
+  }
+  if (!mem->Allocate(size, align, zero)) {
+    // Never hand out an unallocated segment: Address()/Copy()/Free() all
+    // require Allocated(). Report the failure through the nullptr return
+    // that the other error paths already use.
+    delete mem;
+    return nullptr;
+  }
+  return mem;
+}
+
+bool LoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, // not used.
+                                hsa_agent_t agent, // not used.
+                                void* dst,
+                                size_t offset,
+                                const void* src,
+                                size_t size)
+{
+  assert(nullptr != dst);
+  return ((SegmentMemory*)dst)->Copy(offset, src, size);
+}
+
+void LoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, // not used.
+                                hsa_agent_t agent, // not used.
+                                void* seg,
+                                size_t size) // not used.
+{
+  assert(nullptr != seg);
+  SegmentMemory *mem = (SegmentMemory*)seg;
+  mem->Free();
+  delete mem;
+  mem = nullptr;
+}
+
+void* LoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, // not used.
+                                    hsa_agent_t agent, // not used.
+                                    void* seg,
+                                    size_t offset)
+{
+  assert(nullptr != seg);
+  return ((SegmentMemory*)seg)->Address(offset);
+}
+
+void* LoaderContext::SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, // not used.
+                                        hsa_agent_t agent, // not used.
+ void* seg, + size_t offset) +{ + assert(nullptr != seg); + return ((SegmentMemory*)seg)->HostAddress(offset); +} + +bool LoaderContext::SegmentFreeze(amdgpu_hsa_elf_segment_t segment, // not used. + hsa_agent_t agent, // not used. + void* seg, + size_t size) // not used. +{ + assert(nullptr != seg); + return ((SegmentMemory*)seg)->Freeze(); +} + +bool LoaderContext::ImageExtensionSupported() { + hsa_status_t hsa_status = HSA_STATUS_SUCCESS; + bool result = false; + + hsa_status = + HSA::hsa_system_extension_supported(HSA_EXTENSION_IMAGES, 1, 0, &result); + if (HSA_STATUS_SUCCESS != hsa_status) { + return false; + } + + return result; +} + +hsa_status_t LoaderContext::ImageCreate( + hsa_agent_t agent, hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t *image_descriptor, const void *image_data, + hsa_ext_image_t *image_handle) { + assert(agent.handle); + assert(image_descriptor); + assert(image_data); + assert(image_handle); + + assert(ImageExtensionSupported()); + + return hsa_ext_image_create(agent, image_descriptor, image_data, + image_permission, image_handle); +} + +hsa_status_t LoaderContext::ImageDestroy(hsa_agent_t agent, + hsa_ext_image_t image_handle) { + assert(agent.handle); + assert(image_handle.handle); + + assert(ImageExtensionSupported()); + + return hsa_ext_image_destroy(agent, image_handle); +} + +hsa_status_t LoaderContext::SamplerCreate( + hsa_agent_t agent, const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle) { + assert(agent.handle); + assert(sampler_descriptor); + assert(sampler_handle); + + assert(ImageExtensionSupported()); + + return hsa_ext_sampler_create(agent, sampler_descriptor, sampler_handle); +} + +hsa_status_t LoaderContext::SamplerDestroy(hsa_agent_t agent, + hsa_ext_sampler_t sampler_handle) { + assert(agent.handle); + assert(sampler_handle.handle); + + assert(ImageExtensionSupported()); + + return hsa_ext_sampler_destroy(agent, sampler_handle); +} + +} // 
namespace amd diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp new file mode 100644 index 0000000000..b3b4179247 --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -0,0 +1,555 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/inc/amd_memory_region.h"
+
+#include <algorithm>
+
+#include "core/inc/runtime.h"
+#include "core/inc/amd_cpu_agent.h"
+#include "core/inc/amd_gpu_agent.h"
+#include "core/util/utils.h"
+
+namespace amd {
+/// @brief Allocates @p size bytes on KFD node @p node_id with the given
+/// allocation flags.
+/// @return The allocated address, or NULL when hsaKmtAllocMemory fails.
+void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
+                                      HSAuint32 node_id, size_t size) {
+  void* ret = NULL;
+  const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret);
+  return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL;
+}
+
+/// @brief Releases memory obtained from AllocateKfdMemory. A NULL pointer or
+/// a zero size is ignored.
+void MemoryRegion::FreeKfdMemory(void* ptr, size_t size) {
+  if (ptr == NULL || size == 0) {
+    return;
+  }
+
+  HSAKMT_STATUS status = hsaKmtFreeMemory(ptr, size);
+  assert(status == HSAKMT_STATUS_SUCCESS);
+  // Keeps builds that compile assert() out free of an unused-variable
+  // warning; a failed free is still only diagnosed in debug builds.
+  (void)status;
+}
+
+/// @brief Registers the range [ptr, ptr + size) with the KFD nodes listed in
+/// @p nodes.
+/// @return true when hsaKmtRegisterMemoryToNodes reports success.
+bool MemoryRegion::RegisterMemory(void* ptr, size_t size, size_t num_nodes,
+                                  const uint32_t* nodes) {
+  assert(ptr != NULL);
+  assert(size != 0);
+  assert(num_nodes != 0);
+  assert(nodes != NULL);
+
+  // The thunk entry point takes a non-const node array, hence the cast.
+  const HSAKMT_STATUS status = hsaKmtRegisterMemoryToNodes(
+      ptr, size, num_nodes, const_cast<uint32_t*>(nodes));
+  return (status == HSAKMT_STATUS_SUCCESS);
+}
+
+/// @brief Undoes RegisterMemory for @p ptr. The driver status is ignored.
+void MemoryRegion::DeregisterMemory(void* ptr) { hsaKmtDeregisterMemory(ptr); }
+
+/// @brief Maps [ptr, ptr + size) to the GPU nodes listed in @p nodes.
+/// @param alternate_va Must not be NULL. It is reset to 0 before the driver
+/// call and then handed to hsaKmtMapMemoryToGPUNodes as an output argument.
+/// @return true when hsaKmtMapMemoryToGPUNodes reports success.
+bool MemoryRegion::MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes,
+                                         void* ptr, size_t size,
+                                         uint64_t* alternate_va,
+                                         HsaMemMapFlags map_flag) {
+  assert(num_node > 0);
+  assert(nodes != NULL);
+  // alternate_va is dereferenced unconditionally on the next line.
+  assert(alternate_va != NULL);
+
+  *alternate_va = 0;
+  const HSAKMT_STATUS status =
+      hsaKmtMapMemoryToGPUNodes(ptr, size, alternate_va, map_flag, num_node,
+                                const_cast<uint32_t*>(nodes));
+
+  return (status == HSAKMT_STATUS_SUCCESS);
+}
+
+/// @brief Unmaps @p ptr from the GPU. The driver status is ignored.
+void MemoryRegion::MakeKfdMemoryUnresident(void* ptr) {
+  hsaKmtUnmapMemoryToGPU(ptr);
+}
+
+MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, + core::Agent* owner, + const HsaMemoryProperties& mem_props) + : core::MemoryRegion(fine_grain, full_profile, owner), + mem_props_(mem_props), + max_single_alloc_size_(0), + virtual_size_(0) { + virtual_size_ = GetPhysicalSize(); + + mem_flag_.Value = 0; + map_flag_.Value = 0; + + static const HSAuint64 kGpuVmSize = (1ULL << 40); + + if (IsLocalMemory()) { + mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; + mem_flag_.ui32.NoSubstitute = 1; + mem_flag_.ui32.HostAccess = + (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) ? 0 : 1; + mem_flag_.ui32.NonPaged = 1; + + map_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; + + virtual_size_ = kGpuVmSize; + } else if (IsSystem()) { + mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; + mem_flag_.ui32.NoSubstitute = 1; + mem_flag_.ui32.HostAccess = 1; + mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; + + map_flag_.ui32.HostAccess = 1; + map_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; + + virtual_size_ = + (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; + } + + max_single_alloc_size_ = + AlignDown(static_cast(GetPhysicalSize()), kPageSize_); + + mem_flag_.ui32.CoarseGrain = (fine_grain) ? 
0 : 1; + + assert(GetVirtualSize() != 0); + assert(GetPhysicalSize() <= GetVirtualSize()); + assert(IsMultipleOf(max_single_alloc_size_, kPageSize_)); +} + +MemoryRegion::~MemoryRegion() {} + +hsa_status_t MemoryRegion::Allocate(size_t size, void** address) const { + return Allocate(false, size, address); +} + +hsa_status_t MemoryRegion::Allocate(bool restrict_access, size_t size, + void** address) const { + if (address == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + if (!IsSystem() && !IsLocalMemory()) { + return HSA_STATUS_ERROR_INVALID_ALLOCATION; + } + + if (size > max_single_alloc_size_) { + return HSA_STATUS_ERROR_INVALID_ALLOCATION; + } + + size = AlignUp(size, kPageSize_); + + *address = AllocateKfdMemory(mem_flag_, owner()->node_id(), size); + + if (*address != NULL) { + // Commit the memory. + // For system memory, on non-restricted allocation, map it to all GPUs. On + // restricted allocation, only CPU is allowed to access by default, so + // no need to map + // For local memory, only map it to the owning GPU. Mapping to other GPU, + // if the access is allowed, is performed on AllowAccess. + HsaMemMapFlags map_flag = map_flag_; + size_t map_node_count = 1; + const uint32_t owner_node_id = owner()->node_id(); + const uint32_t* map_node_id = &owner_node_id; + + if (IsSystem()) { + if (!restrict_access) { + // Map to all GPU agents. + map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size(); + + if (map_node_count == 0) { + // No need to pin since no GPU in the platform. + return HSA_STATUS_SUCCESS; + } + + map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0]; + } else { + // No need to pin it for CPU exclusive access. 
+ return HSA_STATUS_SUCCESS; + } + } + + uint64_t alternate_va = 0; + const bool is_resident = MakeKfdMemoryResident( + map_node_count, map_node_id, *address, size, &alternate_va, map_flag); + + const bool require_pinning = + (!full_profile() || IsLocalMemory() || IsScratch()); + + if (require_pinning && !is_resident) { + FreeKfdMemory(*address, size); + *address = NULL; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return HSA_STATUS_SUCCESS; + } + + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; +} + +hsa_status_t MemoryRegion::Free(void* address, size_t size) const { + MakeKfdMemoryUnresident(address); + + FreeKfdMemory(address, size); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, + void* value) const { + switch (attribute) { + case HSA_REGION_INFO_SEGMENT: + switch (mem_props_.HeapType) { + case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: + case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: + *((hsa_region_segment_t*)value) = HSA_REGION_SEGMENT_GLOBAL; + break; + case HSA_HEAPTYPE_GPU_LDS: + *((hsa_region_segment_t*)value) = HSA_REGION_SEGMENT_GROUP; + break; + default: + assert(false && "Memory region should only be global, group"); + break; + } + break; + case HSA_REGION_INFO_GLOBAL_FLAGS: + switch (mem_props_.HeapType) { + case HSA_HEAPTYPE_SYSTEM: + *((uint32_t*)value) = fine_grain() + ? (HSA_REGION_GLOBAL_FLAG_KERNARG | + HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) + : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + break; + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: + case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: + *((uint32_t*)value) = HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + break; + default: + *((uint32_t*)value) = 0; + break; + } + break; + case HSA_REGION_INFO_SIZE: + switch (mem_props_.HeapType) { + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: + case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: + *((size_t*)value) = static_cast(GetPhysicalSize()); + break; + default: + *((size_t*)value) = static_cast( + (full_profile()) ? 
GetVirtualSize() : GetPhysicalSize()); + break; + } + break; + case HSA_REGION_INFO_ALLOC_MAX_SIZE: + switch (mem_props_.HeapType) { + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: + case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: + case HSA_HEAPTYPE_SYSTEM: + *((size_t*)value) = max_single_alloc_size_; + break; + default: + *((size_t*)value) = 0; + } + break; + case HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: + switch (mem_props_.HeapType) { + case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: + case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: + *((bool*)value) = true; + break; + default: + *((bool*)value) = false; + break; + } + break; + case HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: + switch (mem_props_.HeapType) { + case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: + case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: + *((size_t*)value) = kPageSize_; + break; + default: + *((size_t*)value) = 0; + break; + } + break; + case HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: + switch (mem_props_.HeapType) { + case HSA_HEAPTYPE_SYSTEM: + case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: + case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: + *((size_t*)value) = kPageSize_; + break; + default: + *((size_t*)value) = 0; + break; + } + break; + default: + switch ((hsa_amd_region_info_t)attribute) { + case HSA_AMD_REGION_INFO_HOST_ACCESSIBLE: + *((bool*)value) = + (mem_props_.HeapType == HSA_HEAPTYPE_SYSTEM) ? 
true : false; + break; + case HSA_AMD_REGION_INFO_BASE: + *((void**)value) = reinterpret_cast(GetBaseAddress()); + break; + case HSA_AMD_REGION_INFO_BUS_WIDTH: + *((uint32_t*)value) = BusWidth(); + break; + case HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY: + *((uint32_t*)value) = MaxMemCloc(); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + break; + } + break; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t MemoryRegion::GetPoolInfo(hsa_amd_memory_pool_info_t attribute, + void* value) const { + switch (attribute) { + case HSA_AMD_MEMORY_POOL_INFO_SEGMENT: + case HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS: + case HSA_AMD_MEMORY_POOL_INFO_SIZE: + case HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED: + case HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE: + case HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT: + return GetInfo(static_cast(attribute), value); + break; + case HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL: + *((bool*)value) = IsSystem() ? true : false; + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t MemoryRegion::GetAgentPoolInfo( + const core::Agent& agent, hsa_amd_agent_memory_pool_info_t attribute, + void* value) const { + const uint32_t node_id_from = agent.node_id(); + const uint32_t node_id_to = owner()->node_id(); + + const core::Runtime::LinkInfo link_info = + core::Runtime::runtime_singleton_->GetLinkInfo(node_id_from, node_id_to); + + switch (attribute) { + case HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS: + /** + * --------------------------------------------------- + * | |CPU |GPU (owner)|GPU (peer) | + * --------------------------------------------------- + * |system memory |allowed |disallowed |disallowed | + * --------------------------------------------------- + * |fb private |never |allowed |never | + * --------------------------------------------------- + * |fb public |disallowed |allowed |disallowed | + * --------------------------------------------------- + * 
|others         |never      |allowed    |never      |
+       *  ---------------------------------------------------
+       */
+      // Allowed by default: a CPU agent on system memory, or any agent on
+      // the owner's node. Disallowed by default: any other agent on system
+      // memory, or on a public pool reachable over at least one link hop.
+      // Everything else is never allowed.
+      *((hsa_amd_memory_pool_access_t*)value) =
+          (((IsSystem()) &&
+            (agent.device_type() == core::Agent::kAmdCpuDevice)) ||
+           (agent.node_id() == owner()->node_id()))
+              ? HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT
+              : (IsSystem() || (IsPublic() && link_info.num_hop > 0))
+                    ? HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT
+                    : HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
+      break;
+    case HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS:
+      *((uint32_t*)value) = link_info.num_hop;
+      // This attribute only writes a uint32_t. Without the break, control
+      // fell into the LINK_INFO case below, which clears and copies a whole
+      // hsa_amd_memory_pool_link_info_t over the caller's buffer.
+      break;
+    case HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO:
+      // Zero-filled when there is no hop between the two nodes.
+      memset(value, 0, sizeof(hsa_amd_memory_pool_link_info_t));
+      if (link_info.num_hop > 0) {
+        memcpy(value, &link_info.info, sizeof(hsa_amd_memory_pool_link_info_t));
+      }
+      break;
+    default:
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+/// @brief Makes the allocation at @p ptr accessible to the listed agents.
+/// GPU agents contribute their node id to the map list; any other agent only
+/// sets the host-access map flag.
+/// @return HSA_STATUS_ERROR_INVALID_ARGUMENT on an empty or NULL argument,
+/// HSA_STATUS_ERROR when the region is neither system nor local memory,
+/// HSA_STATUS_ERROR_INVALID_AGENT when an agent handle is invalid.
+hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
+                                       const hsa_agent_t* agents,
+                                       const void* ptr, size_t size) const {
+  if (num_agents == 0 || agents == NULL || ptr == NULL || size == 0) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (!IsSystem() && !IsLocalMemory()) {
+    return HSA_STATUS_ERROR;
+  }
+
+  bool cpu_in_list = false;
+
+  std::vector<uint32_t> whitelist_nodes;
+  for (uint32_t i = 0; i < num_agents; ++i) {
+    const core::Agent* agent = core::Agent::Convert(agents[i]);
+    if (agent == NULL || !agent->IsValid()) {
+      return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+
+    if (agent->device_type() == core::Agent::kAmdGpuDevice) {
+      whitelist_nodes.push_back(agent->node_id());
+    } else {
+      cpu_in_list = true;
+    }
+  }
+
+  if (whitelist_nodes.size() == 0 && IsSystem()) {
+    assert(cpu_in_list);
+    // This is a system region and only CPU agents in the whitelist.
+    // No need to call map.
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // If this is a local memory region, the owning gpu always needs to be in
+  // the whitelist.
+  if (IsPublic() &&
+      std::find(whitelist_nodes.begin(), whitelist_nodes.end(),
+                owner()->node_id()) == whitelist_nodes.end()) {
+    whitelist_nodes.push_back(owner()->node_id());
+  }
+
+  if (whitelist_nodes.empty()) {
+    // Reached for a non-public local region when no GPU agent was listed.
+    // There is no node to map to, and &whitelist_nodes[0] below would index
+    // an empty vector.
+    return HSA_STATUS_ERROR;
+  }
+
+  HsaMemMapFlags map_flag = map_flag_;
+  map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0;
+
+  uint64_t alternate_va = 0;
+  return (amd::MemoryRegion::MakeKfdMemoryResident(
+             whitelist_nodes.size(), &whitelist_nodes[0],
+             const_cast<void*>(ptr), size, &alternate_va, map_flag))
+             ? HSA_STATUS_SUCCESS
+             : HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+}
+
+/// @brief Not implemented: always sets @p result to false and returns
+/// HSA_STATUS_ERROR_OUT_OF_RESOURCES.
+hsa_status_t MemoryRegion::CanMigrate(const MemoryRegion& dst,
+                                      bool& result) const {
+  // TODO(bwicakso): not implemented yet.
+  result = false;
+  return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+}
+
+/// @brief Not implemented: always returns HSA_STATUS_ERROR_OUT_OF_RESOURCES.
+hsa_status_t MemoryRegion::Migrate(uint32_t flag, const void* ptr) const {
+  // TODO(bwicakso): not implemented yet.
+  return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+}
+
+/// @brief Registers and pins a host allocation so the listed GPU agents can
+/// access it, and reports the address they should use in @p agent_ptr.
+/// An empty or NULL agent list means "all GPU agents known to the runtime".
+/// On a full-profile region, or when no GPU agent ends up in the list, the
+/// host pointer is returned unchanged and nothing is registered.
+/// @return HSA_STATUS_ERROR when the region is not system memory,
+/// HSA_STATUS_ERROR_INVALID_AGENT when an agent handle is invalid.
+hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
+                                void* host_ptr, size_t size,
+                                void** agent_ptr) const {
+  if (!IsSystem()) {
+    return HSA_STATUS_ERROR;
+  }
+
+  if (full_profile()) {
+    // For APU, any host pointer is always accessible by the gpu.
+    *agent_ptr = host_ptr;
+    return HSA_STATUS_SUCCESS;
+  }
+
+  std::vector<uint32_t> whitelist_nodes;
+  if (num_agents == 0 || agents == NULL) {
+    // Map to all GPU agents.
+    whitelist_nodes = core::Runtime::runtime_singleton_->gpu_ids();
+  } else {
+    // Index type matches num_agents to avoid a signed/unsigned comparison.
+    for (uint32_t i = 0; i < num_agents; ++i) {
+      core::Agent* agent = core::Agent::Convert(agents[i]);
+      if (agent == NULL || !agent->IsValid()) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+      }
+
+      if (agent->device_type() == core::Agent::kAmdGpuDevice) {
+        whitelist_nodes.push_back(agent->node_id());
+      }
+    }
+  }
+
+  if (whitelist_nodes.size() == 0) {
+    // No GPU agents in the whitelist. So no need to register and map since the
+    // platform only has CPUs.
+    *agent_ptr = host_ptr;
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Call kernel driver to register and pin the memory.
+ if (RegisterMemory(host_ptr, size, whitelist_nodes.size(), + &whitelist_nodes[0])) { + uint64_t alternate_va = 0; + if (MakeKfdMemoryResident(whitelist_nodes.size(), &whitelist_nodes[0], + host_ptr, size, &alternate_va, map_flag_)) { + assert(alternate_va != 0); + *agent_ptr = reinterpret_cast(alternate_va); + return HSA_STATUS_SUCCESS; + } + amd::MemoryRegion::DeregisterMemory(host_ptr); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + return HSA_STATUS_ERROR; +} + +hsa_status_t MemoryRegion::Unlock(void* host_ptr) const { + if (!IsSystem()) { + return HSA_STATUS_ERROR; + } + + if (full_profile()) { + return HSA_STATUS_SUCCESS; + } + + MakeKfdMemoryUnresident(host_ptr); + DeregisterMemory(host_ptr); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size, + const core::Agent& agent, + hsa_access_permission_t access) const { + return HSA_STATUS_SUCCESS; +} + +} // namespace diff --git a/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/runtime/hsa-runtime/core/runtime/amd_topology.cpp new file mode 100644 index 0000000000..2e071f05ca --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -0,0 +1,210 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/inc/amd_topology.h"
+
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+#include "hsakmt.h"
+
+#include "core/inc/runtime.h"
+#include "core/inc/amd_cpu_agent.h"
+#include "core/inc/amd_gpu_agent.h"
+#include "core/inc/amd_memory_region.h"
+#include "core/util/utils.h"
+
+namespace amd {
+// Minimum acceptable KFD version numbers
+static const uint32_t kKfdVersionMajor = 0;
+static const uint32_t kKfdVersionMinor = 99;
+
+/// @brief Creates a CpuAgent for @p node_id and registers it with the
+/// runtime.
+/// @return The new agent, or NULL when the node reports no CPU cores.
+CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
+  if (node_prop.NumCPUCores == 0) {
+    return NULL;
+  }
+
+  CpuAgent* cpu = new CpuAgent(node_id, node_prop);
+  core::Runtime::runtime_singleton_->RegisterAgent(cpu);
+
+  return cpu;
+}
+
+/// @brief Creates a GpuAgent for @p node_id, registers it with the runtime
+/// and initializes its DMA support.
+/// @return The new agent, or NULL when the node reports no compute cores or
+/// InitDma fails.
+GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
+  if (node_prop.NumFComputeCores == 0) {
+    return NULL;
+  }
+
+  GpuAgent* gpu = new GpuAgent(node_id, node_prop);
+  core::Runtime::runtime_singleton_->RegisterAgent(gpu);
+
+  if (HSA_STATUS_SUCCESS != gpu->InitDma()) {
+    assert(false && "Fail init blit");
+    // NOTE(review): the agent was already handed to RegisterAgent above and
+    // is deleted here without a matching unregister call. If the runtime
+    // keeps the pointer, it dangles in builds where assert() is compiled
+    // out. Confirm against Runtime::RegisterAgent.
+    delete gpu;
+    gpu = NULL;
+  }
+
+  return gpu;
+}
+
+/// @brief Queries the IO links of @p node_id from the thunk and registers
+/// each usable one with the runtime.
+void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
+  // Register connectivity links for this agent to the runtime.
+  if (num_link == 0) {
+    return;
+  }
+
+  std::vector<HsaIoLinkProperties> links(num_link);
+  if (HSAKMT_STATUS_SUCCESS !=
+      hsaKmtGetNodeIoLinkProperties(node_id, num_link, &links[0])) {
+    return;
+  }
+
+  // The loop only reads each entry, so bind by const reference instead of
+  // copying the whole property struct per iteration.
+  for (const HsaIoLinkProperties& io_link : links) {
+    // Populate link info with thunk property.
+    hsa_amd_memory_pool_link_info_t link_info = {0};
+
+    if (io_link.Flags.ui32.Override == 1) {
+      if (io_link.Flags.ui32.NoPeerToPeerDMA == 1) {
+        // Ignore this link since peer to peer is not allowed.
+ continue; + } + link_info.atomic_support_32bit = (io_link.Flags.ui32.NoAtomics32bit == 0); + link_info.atomic_support_64bit = (io_link.Flags.ui32.NoAtomics64bit == 0); + link_info.coherent_support = (io_link.Flags.ui32.NonCoherent == 0); + } else { + // TODO(bwicakso): decipher HSA_IOLINKTYPE to fill out the atomic + // and coherent information. + } + + switch (io_link.IoLinkType) { + case HSA_IOLINKTYPE_HYPERTRANSPORT: + link_info.link_type = HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT; + break; + case HSA_IOLINKTYPE_PCIEXPRESS: + link_info.link_type = HSA_AMD_LINK_INFO_TYPE_PCIE; + break; + case HSA_IOLINK_TYPE_QPI_1_1: + link_info.link_type = HSA_AMD_LINK_INFO_TYPE_QPI; + break; + case HSA_IOLINK_TYPE_INFINIBAND: + link_info.link_type = HSA_AMD_LINK_INFO_TYPE_INFINBAND; + break; + default: + break; + } + + link_info.max_bandwidth = io_link.MaximumBandwidth; + link_info.max_latency = io_link.MaximumLatency; + link_info.min_bandwidth = io_link.MinimumBandwidth; + link_info.min_latency = io_link.MinimumLatency; + + core::Runtime::runtime_singleton_->RegisterLinkInfo( + io_link.NodeFrom, io_link.NodeTo, io_link.Weight, link_info); + } +} + +/// @brief Calls Kfd thunk to get the snapshot of the topology of the system, +/// which includes associations between, node, devices, memory and caches. 
+void BuildTopology() { + HsaVersionInfo info; + if (hsaKmtGetVersion(&info) != HSAKMT_STATUS_SUCCESS) { + return; + } + + if (info.KernelInterfaceMajorVersion == kKfdVersionMajor && + info.KernelInterfaceMinorVersion < kKfdVersionMinor) { + return; + } + + // Disable KFD event support when using open source KFD + if (info.KernelInterfaceMajorVersion == 1 && + info.KernelInterfaceMinorVersion == 0) { + core::g_use_interrupt_wait = false; + } + + HsaSystemProperties props; + hsaKmtReleaseSystemProperties(); + + if (hsaKmtAcquireSystemProperties(&props) != HSAKMT_STATUS_SUCCESS) { + return; + } + + core::Runtime::runtime_singleton_->SetLinkCount(props.NumNodes); + + // Discover agents on every node in the platform. + for (HSAuint32 node_id = 0; node_id < props.NumNodes; node_id++) { + HsaNodeProperties node_prop = {0}; + if (hsaKmtGetNodeProperties(node_id, &node_prop) != HSAKMT_STATUS_SUCCESS) { + continue; + } + + const CpuAgent* cpu = DiscoverCpu(node_id, node_prop); + const GpuAgent* gpu = DiscoverGpu(node_id, node_prop); + + assert(!(cpu == NULL && gpu == NULL)); + + RegisterLinkInfo(node_id, node_prop.NumIOLinks); + } +} + +bool Load() { + // Open connection to kernel driver. + if (hsaKmtOpenKFD() != HSAKMT_STATUS_SUCCESS) { + return false; + } + + // Build topology table. + BuildTopology(); + + return true; +} + +bool Unload() { + hsaKmtReleaseSystemProperties(); + + // Close connection to kernel driver. + hsaKmtCloseKFD(); + + return true; +} +} // namespace amd diff --git a/runtime/hsa-runtime/core/runtime/default_signal.cpp b/runtime/hsa-runtime/core/runtime/default_signal.cpp new file mode 100644 index 0000000000..9b81de360b --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/default_signal.cpp @@ -0,0 +1,275 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. 
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/default_signal.h" +#include "core/util/timer.h" + +namespace core { + +int DefaultSignal::rtti_id_ = 0; + +DefaultSignal::DefaultSignal(hsa_signal_value_t initial_value) + : Signal(initial_value) { + signal_.kind = AMD_SIGNAL_KIND_USER; + signal_.event_mailbox_ptr = NULL; + HSA::hsa_memory_register(this, sizeof(DefaultSignal)); +} + +DefaultSignal::~DefaultSignal() { + invalid_ = true; + while (InUse()) + ; + HSA::hsa_memory_deregister(this, sizeof(DefaultSignal)); +} + +hsa_signal_value_t DefaultSignal::LoadRelaxed() { + return hsa_signal_value_t( + atomic::Load(&signal_.value, std::memory_order_relaxed)); +} + +hsa_signal_value_t DefaultSignal::LoadAcquire() { + return hsa_signal_value_t( + atomic::Load(&signal_.value, std::memory_order_acquire)); +} + +void DefaultSignal::StoreRelaxed(hsa_signal_value_t value) { + atomic::Store(&signal_.value, int64_t(value), std::memory_order_relaxed); +} + +void DefaultSignal::StoreRelease(hsa_signal_value_t value) { + atomic::Store(&signal_.value, int64_t(value), std::memory_order_release); +} + +hsa_signal_value_t DefaultSignal::WaitRelaxed(hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout, + hsa_wait_state_t wait_hint) { + atomic::Increment(&waiting_); + MAKE_SCOPE_GUARD([&]() { atomic::Decrement(&waiting_); }); + bool condition_met = false; + int64_t value; + + assert(!g_use_interrupt_wait && "Use of non-host signal in host signal wait API."); + + timer::fast_clock::time_point start_time, time; + start_time = timer::fast_clock::now(); + + uint64_t hsa_freq; + HSA::hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq); + const timer::fast_clock::duration fast_timeout = + timer::duration_from_seconds( + double(timeout) / double(hsa_freq)); + + while (true) { + if (invalid_) return 0; + + value = atomic::Load(&signal_.value, std::memory_order_relaxed); + + switch 
(condition) { + case HSA_SIGNAL_CONDITION_EQ: { + condition_met = (value == compare_value); + break; + } + case HSA_SIGNAL_CONDITION_NE: { + condition_met = (value != compare_value); + break; + } + case HSA_SIGNAL_CONDITION_GTE: { + condition_met = (value >= compare_value); + break; + } + case HSA_SIGNAL_CONDITION_LT: { + condition_met = (value < compare_value); + break; + } + default: + return 0; + } + if (condition_met) return hsa_signal_value_t(value); + + time = timer::fast_clock::now(); + if (time - start_time > fast_timeout) { + value = atomic::Load(&signal_.value, std::memory_order_relaxed); + return hsa_signal_value_t(value); + } + } +} + +hsa_signal_value_t DefaultSignal::WaitAcquire(hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout, + hsa_wait_state_t wait_hint) { + hsa_signal_value_t ret = + WaitRelaxed(condition, compare_value, timeout, wait_hint); + std::atomic_thread_fence(std::memory_order_acquire); + return ret; +} + +void DefaultSignal::AndRelaxed(hsa_signal_value_t value) { + atomic::And(&signal_.value, int64_t(value), std::memory_order_relaxed); +} + +void DefaultSignal::AndAcquire(hsa_signal_value_t value) { + atomic::And(&signal_.value, int64_t(value), std::memory_order_acquire); +} + +void DefaultSignal::AndRelease(hsa_signal_value_t value) { + atomic::And(&signal_.value, int64_t(value), std::memory_order_release); +} + +void DefaultSignal::AndAcqRel(hsa_signal_value_t value) { + atomic::And(&signal_.value, int64_t(value), std::memory_order_acq_rel); +} + +void DefaultSignal::OrRelaxed(hsa_signal_value_t value) { + atomic::Or(&signal_.value, int64_t(value), std::memory_order_relaxed); +} + +void DefaultSignal::OrAcquire(hsa_signal_value_t value) { + atomic::Or(&signal_.value, int64_t(value), std::memory_order_acquire); +} + +void DefaultSignal::OrRelease(hsa_signal_value_t value) { + atomic::Or(&signal_.value, int64_t(value), std::memory_order_release); +} + +void 
DefaultSignal::OrAcqRel(hsa_signal_value_t value) { + atomic::Or(&signal_.value, int64_t(value), std::memory_order_acq_rel); +} + +void DefaultSignal::XorRelaxed(hsa_signal_value_t value) { + atomic::Xor(&signal_.value, int64_t(value), std::memory_order_relaxed); +} + +void DefaultSignal::XorAcquire(hsa_signal_value_t value) { + atomic::Xor(&signal_.value, int64_t(value), std::memory_order_acquire); +} + +void DefaultSignal::XorRelease(hsa_signal_value_t value) { + atomic::Xor(&signal_.value, int64_t(value), std::memory_order_release); +} + +void DefaultSignal::XorAcqRel(hsa_signal_value_t value) { + atomic::Xor(&signal_.value, int64_t(value), std::memory_order_acq_rel); +} + +void DefaultSignal::AddRelaxed(hsa_signal_value_t value) { + atomic::Add(&signal_.value, int64_t(value), std::memory_order_relaxed); +} + +void DefaultSignal::AddAcquire(hsa_signal_value_t value) { + atomic::Add(&signal_.value, int64_t(value), std::memory_order_acquire); +} + +void DefaultSignal::AddRelease(hsa_signal_value_t value) { + atomic::Add(&signal_.value, int64_t(value), std::memory_order_release); +} + +void DefaultSignal::AddAcqRel(hsa_signal_value_t value) { + atomic::Add(&signal_.value, int64_t(value), std::memory_order_acq_rel); +} + +void DefaultSignal::SubRelaxed(hsa_signal_value_t value) { + atomic::Sub(&signal_.value, int64_t(value), std::memory_order_relaxed); +} + +void DefaultSignal::SubAcquire(hsa_signal_value_t value) { + atomic::Sub(&signal_.value, int64_t(value), std::memory_order_acquire); +} + +void DefaultSignal::SubRelease(hsa_signal_value_t value) { + atomic::Sub(&signal_.value, int64_t(value), std::memory_order_release); +} + +void DefaultSignal::SubAcqRel(hsa_signal_value_t value) { + atomic::Sub(&signal_.value, int64_t(value), std::memory_order_acq_rel); +} + +hsa_signal_value_t DefaultSignal::ExchRelaxed(hsa_signal_value_t value) { + return hsa_signal_value_t(atomic::Exchange(&signal_.value, int64_t(value), + std::memory_order_relaxed)); +} + 
+hsa_signal_value_t DefaultSignal::ExchAcquire(hsa_signal_value_t value) { + return hsa_signal_value_t(atomic::Exchange(&signal_.value, int64_t(value), + std::memory_order_acquire)); +} + +hsa_signal_value_t DefaultSignal::ExchRelease(hsa_signal_value_t value) { + return hsa_signal_value_t(atomic::Exchange(&signal_.value, int64_t(value), + std::memory_order_release)); +} + +hsa_signal_value_t DefaultSignal::ExchAcqRel(hsa_signal_value_t value) { + return hsa_signal_value_t(atomic::Exchange(&signal_.value, int64_t(value), + std::memory_order_acq_rel)); +} + +hsa_signal_value_t DefaultSignal::CasRelaxed(hsa_signal_value_t expected, + hsa_signal_value_t value) { + return hsa_signal_value_t(atomic::Cas(&signal_.value, int64_t(value), + int64_t(expected), + std::memory_order_relaxed)); +} + +hsa_signal_value_t DefaultSignal::CasAcquire(hsa_signal_value_t expected, + hsa_signal_value_t value) { + return hsa_signal_value_t(atomic::Cas(&signal_.value, int64_t(value), + int64_t(expected), + std::memory_order_acquire)); +} + +hsa_signal_value_t DefaultSignal::CasRelease(hsa_signal_value_t expected, + hsa_signal_value_t value) { + return hsa_signal_value_t(atomic::Cas(&signal_.value, int64_t(value), + int64_t(expected), + std::memory_order_release)); +} + +hsa_signal_value_t DefaultSignal::CasAcqRel(hsa_signal_value_t expected, + hsa_signal_value_t value) { + return hsa_signal_value_t(atomic::Cas(&signal_.value, int64_t(value), + int64_t(expected), + std::memory_order_acq_rel)); +} + +} // namespace core diff --git a/runtime/hsa-runtime/core/runtime/host_queue.cpp b/runtime/hsa-runtime/core/runtime/host_queue.cpp new file mode 100644 index 0000000000..3803b6508c --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/host_queue.cpp @@ -0,0 +1,99 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. 
All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/host_queue.h" + +#include "core/inc/runtime.h" +#include "core/util/utils.h" + +namespace core { +// Builds a host (soft) queue: allocates the packet ring from region and fills +// in the amd_queue_ descriptor. active_ stays false if any step fails, which +// callers use to detect a failed construction. +HostQueue::HostQueue(hsa_region_t region, uint32_t ring_size, + hsa_queue_type_t type, uint32_t features, + hsa_signal_t doorbell_signal) + : Queue(), + size_(ring_size), + active_(false) { + if (!Shared::IsSharedObjectAllocationValid()) { + return; + } + + // Start from a known value so that the destructor's hsa_memory_free(ring_) + // is a harmless no-op when the allocation below fails. + ring_ = NULL; + + HSA::hsa_memory_register(this, sizeof(HostQueue)); + + const size_t queue_buffer_size = size_ * sizeof(AqlPacket); + if (HSA_STATUS_SUCCESS != + HSA::hsa_memory_allocate(region, queue_buffer_size, &ring_)) { + return; + } + + assert(IsMultipleOf(ring_, kRingAlignment)); + assert(ring_ != NULL); + + amd_queue_.hsa_queue.base_address = ring_; + amd_queue_.hsa_queue.size = size_; + amd_queue_.hsa_queue.doorbell_signal = doorbell_signal; + amd_queue_.hsa_queue.id = Runtime::runtime_singleton_->GetQueueId(); + amd_queue_.hsa_queue.type = type; + amd_queue_.hsa_queue.features = features; +#ifdef HSA_LARGE_MODEL + AMD_HSA_BITS_SET( + amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, 1); +#else + AMD_HSA_BITS_SET( + amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, 0); +#endif + amd_queue_.write_dispatch_id = amd_queue_.read_dispatch_id = 0; + AMD_HSA_BITS_SET( + amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 0); + + active_ = true; +} + +// Releases the packet ring and deregisters the object; does nothing when the +// shared object allocation is not valid (mirrors the constructor's early out). +HostQueue::~HostQueue() { + if (!Shared::IsSharedObjectAllocationValid()) { + return; + } + + HSA::hsa_memory_free(ring_); + HSA::hsa_memory_deregister(this, sizeof(HostQueue)); +} + +} // namespace core diff --git a/runtime/hsa-runtime/core/runtime/hsa.cpp b/runtime/hsa-runtime/core/runtime/hsa.cpp new file mode 100644 index 0000000000..7683eabb08 --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/hsa.cpp @@ -0,0 +1,1710 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of
Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA C to C++ interface implementation. +// This file does argument checking and conversion to C++. 
+#include +#include + +#include "core/inc/runtime.h" +#include "core/inc/agent.h" +#include "core/inc/host_queue.h" +#include "core/inc/isa.h" +#include "core/inc/memory_region.h" +#include "core/inc/queue.h" +#include "core/inc/signal.h" +#include "core/inc/default_signal.h" +#include "core/inc/interrupt_signal.h" +#include "core/inc/amd_load_map.h" +#include "core/inc/amd_loader_context.hpp" + +using namespace amd::hsa::code; + +template +struct ValidityError; +template <> +struct ValidityError { + enum { kValue = HSA_STATUS_ERROR_INVALID_SIGNAL }; +}; +template <> +struct ValidityError { + enum { kValue = HSA_STATUS_ERROR_INVALID_AGENT }; +}; +template <> +struct ValidityError { + enum { kValue = HSA_STATUS_ERROR_INVALID_REGION }; +}; +template <> +struct ValidityError { + enum { kValue = HSA_STATUS_ERROR_INVALID_QUEUE }; +}; +template <> +struct ValidityError { + enum { kValue = HSA_STATUS_ERROR_INVALID_ISA }; +}; +template +struct ValidityError { + enum { kValue = ValidityError::kValue }; +}; + +#define IS_BAD_PTR(ptr) \ + do { \ + if ((ptr) == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT; \ + } while (false) +#define IS_BAD_PROFILE(profile) \ + do { \ + if (profile != HSA_PROFILE_BASE && \ + profile != HSA_PROFILE_FULL) { \ + return HSA_STATUS_ERROR_INVALID_ARGUMENT; \ + } \ + } while (false) +#define IS_VALID(ptr) \ + do { \ + if (((ptr) == NULL) || !((ptr)->IsValid())) \ + return hsa_status_t(ValidityError::kValue); \ + } while (false) +#define CHECK_ALLOC(ptr) \ + do { \ + if ((ptr) == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; \ + } while (false) +#define IS_OPEN() \ + do { \ + if (!core::Runtime::runtime_singleton_->IsOpen()) \ + return HSA_STATUS_ERROR_NOT_INITIALIZED; \ + } while (false) + +template +static __forceinline bool IsValid(T* ptr) { + return (ptr == NULL) ? 
NULL : ptr->IsValid(); +} + +//----------------------------------------------------------------------------- +// Basic Checks +//----------------------------------------------------------------------------- +static_assert(sizeof(hsa_barrier_and_packet_t) == + sizeof(hsa_kernel_dispatch_packet_t), + "AQL packet definitions have wrong sizes!"); +static_assert(sizeof(hsa_barrier_and_packet_t) == + sizeof(hsa_agent_dispatch_packet_t), + "AQL packet definitions have wrong sizes!"); +static_assert(sizeof(hsa_barrier_and_packet_t) == 64, + "AQL packet definitions have wrong sizes!"); +static_assert(sizeof(hsa_barrier_and_packet_t) == + sizeof(hsa_barrier_or_packet_t), + "AQL packet definitions have wrong sizes!"); +#ifdef HSA_LARGE_MODEL +static_assert(sizeof(void*) == 8, "HSA_LARGE_MODEL is set incorrectly!"); +#else +static_assert(sizeof(void*) == 4, "HSA_LARGE_MODEL is set incorrectly!"); +#endif + +namespace HSA { + +//---------------------------------------------------------------------------// +// Init/Shutdown routines +//---------------------------------------------------------------------------// +hsa_status_t hsa_init() { + if (core::Runtime::runtime_singleton_->Acquire()) return HSA_STATUS_SUCCESS; + return HSA_STATUS_ERROR_REFCOUNT_OVERFLOW; +} + +hsa_status_t hsa_shut_down() { + IS_OPEN(); + if (core::Runtime::runtime_singleton_->Release()) return HSA_STATUS_SUCCESS; + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} + +//---------------------------------------------------------------------------// +// System +//---------------------------------------------------------------------------// +hsa_status_t + hsa_system_get_info(hsa_system_info_t attribute, void* value) { + IS_OPEN(); + IS_BAD_PTR(value); + return core::Runtime::runtime_singleton_->GetSystemInfo(attribute, value); +} + +hsa_status_t + hsa_system_extension_supported(uint16_t extension, uint16_t version_major, + uint16_t version_minor, bool* result) { + IS_OPEN(); + + if ((extension > 
HSA_EXTENSION_AMD_PROFILER && + extension != AMD_EXTENSION_LOAD_MAP) || result == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + *result = false; + + uint16_t system_version_major = 0; + hsa_status_t status = core::Runtime::runtime_singleton_->GetSystemInfo( + HSA_SYSTEM_INFO_VERSION_MAJOR, &system_version_major); + assert(status == HSA_STATUS_SUCCESS); + + if (version_major <= system_version_major) { + uint16_t system_version_minor = 0; + status = core::Runtime::runtime_singleton_->GetSystemInfo( + HSA_SYSTEM_INFO_VERSION_MINOR, &system_version_minor); + assert(status == HSA_STATUS_SUCCESS); + + if (version_minor <= system_version_minor) { + *result = true; + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t + hsa_system_get_extension_table(uint16_t extension, uint16_t version_major, + uint16_t version_minor, void* table) { + if (table == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + IS_OPEN(); + + bool supported = false; + hsa_status_t status = hsa_system_extension_supported( + extension, version_major, version_minor, &supported); + + if (HSA_STATUS_SUCCESS != status) { + return status; + } + + if (supported) { + ExtTable& runtime_ext_table = + core::Runtime::runtime_singleton_->extensions_.table; + + if (extension == HSA_EXTENSION_IMAGES) { + // Currently there is only version 1.00. 
+ hsa_ext_images_1_00_pfn_t* ext_table = + reinterpret_cast(table); + ext_table->hsa_ext_image_clear = hsa_ext_image_clear; + ext_table->hsa_ext_image_copy = hsa_ext_image_copy; + ext_table->hsa_ext_image_create = hsa_ext_image_create; + ext_table->hsa_ext_image_data_get_info = hsa_ext_image_data_get_info; + ext_table->hsa_ext_image_destroy = hsa_ext_image_destroy; + ext_table->hsa_ext_image_export = hsa_ext_image_export; + ext_table->hsa_ext_image_get_capability = hsa_ext_image_get_capability; + ext_table->hsa_ext_image_import = hsa_ext_image_import; + ext_table->hsa_ext_sampler_create = hsa_ext_sampler_create; + ext_table->hsa_ext_sampler_destroy = hsa_ext_sampler_destroy; + + return HSA_STATUS_SUCCESS; + } else if (extension == HSA_EXTENSION_FINALIZER) { + // Currently there is only version 1.00. + hsa_ext_finalizer_1_00_pfn_s* ext_table = + reinterpret_cast(table); + ext_table->hsa_ext_program_add_module = hsa_ext_program_add_module; + ext_table->hsa_ext_program_create = hsa_ext_program_create; + ext_table->hsa_ext_program_destroy = hsa_ext_program_destroy; + ext_table->hsa_ext_program_finalize = hsa_ext_program_finalize; + ext_table->hsa_ext_program_get_info = hsa_ext_program_get_info; + ext_table->hsa_ext_program_iterate_modules = + hsa_ext_program_iterate_modules; + + return HSA_STATUS_SUCCESS; + } else if (extension == AMD_EXTENSION_LOAD_MAP) { + // Currently there is only version 1.00. 
+ amd_load_map_1_00_pfn_t* amd_table = + reinterpret_cast(table); + amd_table->amd_executable_load_code_object = amd_executable_load_code_object; + amd_table->amd_iterate_executables = amd_iterate_executables; + amd_table->amd_executable_iterate_loaded_code_objects = amd_executable_iterate_loaded_code_objects; + amd_table->amd_loaded_code_object_get_info = amd_loaded_code_object_get_info; + amd_table->amd_loaded_code_object_iterate_loaded_segments = amd_loaded_code_object_iterate_loaded_segments; + amd_table->amd_loaded_segment_get_info = amd_loaded_segment_get_info; + } else { + // TODO: other extensions are not yet implemented. + return HSA_STATUS_ERROR; + } + } + + return HSA_STATUS_SUCCESS; +} + +//---------------------------------------------------------------------------// +// Agent +//---------------------------------------------------------------------------// +hsa_status_t + hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent, void* data), + void* data) { + IS_OPEN(); + IS_BAD_PTR(callback); + return core::Runtime::runtime_singleton_->IterateAgent(callback, data); +} + +hsa_status_t hsa_agent_get_info(hsa_agent_t agent_handle, + hsa_agent_info_t attribute, + void* value) { + IS_OPEN(); + IS_BAD_PTR(value); + const core::Agent* agent = core::Agent::Convert(agent_handle); + IS_VALID(agent); + return agent->GetInfo(attribute, value); +} + +hsa_status_t hsa_agent_get_exception_policies(hsa_agent_t agent_handle, + hsa_profile_t profile, + uint16_t* mask) { + IS_OPEN(); + IS_BAD_PTR(mask); + IS_BAD_PROFILE(profile); + const core::Agent* agent = core::Agent::Convert(agent_handle); + IS_VALID(agent); + + // TODO: Fix me when exception policies are supported. 
+ *mask = 0; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t + hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent_handle, + uint16_t version_major, + uint16_t version_minor, bool* result) { + IS_OPEN(); + + if ((result == NULL) || (extension > HSA_EXTENSION_AMD_PROFILER)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + *result = false; + + const core::Agent* agent = core::Agent::Convert(agent_handle); + IS_VALID(agent); + + if (agent->device_type() == core::Agent::kAmdGpuDevice) { + uint16_t agent_version_major = 0; + hsa_status_t status = + agent->GetInfo(HSA_AGENT_INFO_VERSION_MAJOR, &agent_version_major); + assert(status == HSA_STATUS_SUCCESS); + + if (version_major <= agent_version_major) { + uint16_t agent_version_minor = 0; + status = + agent->GetInfo(HSA_AGENT_INFO_VERSION_MINOR, &agent_version_minor); + assert(status == HSA_STATUS_SUCCESS); + + if (version_minor <= agent_version_minor) { + *result = true; + } + } + } + + return HSA_STATUS_SUCCESS; +} + +/// @brief Api to create a user mode queue. 
+/// +/// @param agent_handle Handle of the agent which will execute Aql commands +/// +/// @param size Size of Queue in terms of Aql packets; must be a non-zero +/// power of two +/// +/// @param type of Queue Single Writer or Multiple Writer +/// +/// @param callback Callback function to register in case Queue +/// encounters an error +/// +/// @param data Application data forwarded to the agent's QueueCreate +/// together with callback +/// +/// @param private_segment_size Forwarded unchanged to the agent's QueueCreate +/// +/// @param group_segment_size Forwarded unchanged to the agent's QueueCreate +/// +/// @param queue Output parameter updated with a pointer to the +/// queue being created +/// +/// @return hsa_status +hsa_status_t hsa_queue_create( + hsa_agent_t agent_handle, uint32_t size, hsa_queue_type_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue) { + IS_OPEN(); + + if ((queue == NULL) || (size == 0) || (!IsPowerOfTwo(size)) || + (type < HSA_QUEUE_TYPE_MULTI) || (type > HSA_QUEUE_TYPE_SINGLE)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + core::Agent* agent = core::Agent::Convert(agent_handle); + IS_VALID(agent); + + hsa_queue_type_t agent_queue_type = HSA_QUEUE_TYPE_MULTI; + hsa_status_t status = + agent->GetInfo(HSA_AGENT_INFO_QUEUE_TYPE, &agent_queue_type); + assert(HSA_STATUS_SUCCESS == status); + + if (agent_queue_type == HSA_QUEUE_TYPE_SINGLE && + type != HSA_QUEUE_TYPE_SINGLE) { + return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION; + } + + // TODO: private_segment_size and group_segment_size.
+ core::Queue* cmd_queue = NULL; + status = agent->QueueCreate(size, type, callback, data, private_segment_size, + group_segment_size, &cmd_queue); + if (cmd_queue != NULL) { + *queue = core::Queue::Convert(cmd_queue); + if (*queue == NULL) { + delete cmd_queue; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + } else { + *queue = NULL; + } + + return status; +} + +hsa_status_t hsa_soft_queue_create(hsa_region_t region, uint32_t size, + hsa_queue_type_t type, uint32_t features, + hsa_signal_t doorbell_signal, + hsa_queue_t** queue) { + IS_OPEN(); + + if ((queue == NULL) || (region.handle == 0) || + (doorbell_signal.handle == 0) || (size == 0) || (!IsPowerOfTwo(size)) || + (type < HSA_QUEUE_TYPE_MULTI) || (type > HSA_QUEUE_TYPE_SINGLE) || + (features == 0)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + const core::MemoryRegion* mem_region = core::MemoryRegion::Convert(region); + IS_VALID(mem_region); + + const core::Signal* signal = core::Signal::Convert(doorbell_signal); + IS_VALID(signal); + + core::HostQueue* host_queue = + new core::HostQueue(region, size, type, features, doorbell_signal); + + if (!host_queue->active()) { + delete host_queue; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + *queue = core::Queue::Convert(host_queue); + + return HSA_STATUS_SUCCESS; +} + +/// @brief Api to destroy a user mode queue +/// +/// @param queue Pointer to the queue being destroyed +/// +/// @return hsa_status +hsa_status_t hsa_queue_destroy(hsa_queue_t* queue) { + IS_OPEN(); + IS_BAD_PTR(queue); + core::Queue* cmd_queue = core::Queue::Convert(queue); + IS_VALID(cmd_queue); + delete cmd_queue; + return HSA_STATUS_SUCCESS; +} + +/// @brief Api to inactivate a user mode queue +/// +/// @param queue Pointer to the queue being inactivated +/// +/// @return hsa_status +hsa_status_t hsa_queue_inactivate(hsa_queue_t* queue) { + IS_OPEN(); + IS_BAD_PTR(queue); + core::Queue* cmd_queue = core::Queue::Convert(queue); + IS_VALID(cmd_queue); + cmd_queue->Inactivate(); + 
return HSA_STATUS_SUCCESS; +} + +/// @brief Api to read the Read Index of Queue using Acquire semantics +/// +/// @param queue Pointer to the queue whose read index is being read +/// +/// @return uint64_t Value of Read index +uint64_t hsa_queue_load_read_index_acquire(const hsa_queue_t* queue) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->LoadReadIndexAcquire(); +} + +/// @brief Api to read the Read Index of Queue using Relaxed semantics +/// +/// @param queue Pointer to the queue whose read index is being read +/// +/// @return uint64_t Value of Read index +uint64_t hsa_queue_load_read_index_relaxed(const hsa_queue_t* queue) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->LoadReadIndexRelaxed(); +} + +/// @brief Api to read the Write Index of Queue using Acquire semantics +/// +/// @param queue Pointer to the queue whose write index is being read +/// +/// @return uint64_t Value of Write index +uint64_t hsa_queue_load_write_index_acquire(const hsa_queue_t* queue) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->LoadWriteIndexAcquire(); +} + +/// @brief Api to read the Write Index of Queue using Relaxed semantics +/// +/// @param queue Pointer to the queue whose write index is being read +/// +/// @return uint64_t Value of Write index +uint64_t hsa_queue_load_write_index_relaxed(const hsa_queue_t* queue) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + // Relaxed entry point must use the relaxed load, mirroring the read-index + // variant above, rather than the acquire load. + return cmd_queue->LoadWriteIndexRelaxed(); +} + +/// @brief Api to store the Read Index of Queue using Relaxed semantics +/// +/// @param queue Pointer to the queue whose read index is being updated +/// +/// @param value Value of new read index +void hsa_queue_store_read_index_relaxed(const hsa_queue_t* queue, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); +
assert(IsValid(cmd_queue)); + cmd_queue->StoreReadIndexRelaxed(value); +} + +/// @brief Api to store the Read Index of Queue using Release semantics +/// +/// @param queue Pointer to the queue whose read index is being updated +/// +/// @param value Value of new read index +void hsa_queue_store_read_index_release(const hsa_queue_t* queue, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + cmd_queue->StoreReadIndexRelease(value); +} + +/// @brief Api to store the Write Index of Queue using Relaxed semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param value Value of new write index +void hsa_queue_store_write_index_relaxed(const hsa_queue_t* queue, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + cmd_queue->StoreWriteIndexRelaxed(value); +} + +/// @brief Api to store the Write Index of Queue using Release semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param value Value of new write index +void hsa_queue_store_write_index_release(const hsa_queue_t* queue, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + cmd_queue->StoreWriteIndexRelease(value); +} + +/// @brief Api to compare and swap the Write Index of Queue using Acquire and +/// Release semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param expected Current value of write index +/// +/// @param value Value of new write index +/// +/// @return uint64_t Value of write index before the update +uint64_t hsa_queue_cas_write_index_acq_rel(const hsa_queue_t* queue, + uint64_t expected, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->CasWriteIndexAcqRel(expected, value); +} + +/// @brief Api to compare and swap the Write 
Index of Queue using Acquire +/// Semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param expected Current value of write index +/// +/// @param value Value of new write index +/// +/// @return uint64_t Value of write index before the update +uint64_t hsa_queue_cas_write_index_acquire(const hsa_queue_t* queue, + uint64_t expected, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->CasWriteIndexAcquire(expected, value); +} + +/// @brief Api to compare and swap the Write Index of Queue using Relaxed +/// Semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param expected Current value of write index +/// +/// @param value Value of new write index +/// +/// @return uint64_t Value of write index before the update +uint64_t hsa_queue_cas_write_index_relaxed(const hsa_queue_t* queue, + uint64_t expected, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->CasWriteIndexRelaxed(expected, value); +} + +/// @brief Api to compare and swap the Write Index of Queue using Release +/// Semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param expected Current value of write index +/// +/// @param value Value of new write index +/// +/// @return uint64_t Value of write index before the update +uint64_t hsa_queue_cas_write_index_release(const hsa_queue_t* queue, + uint64_t expected, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->CasWriteIndexRelease(expected, value); +} + +/// @brief Api to Add to the Write Index of Queue using Acquire and Release +/// Semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param value Value to add to write index +/// +/// @return uint64_t 
Value of write index before the update +uint64_t hsa_queue_add_write_index_acq_rel(const hsa_queue_t* queue, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->AddWriteIndexAcqRel(value); +} + +/// @brief Api to Add to the Write Index of Queue using Acquire Semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param value Value to add to write index +/// +/// @return uint64_t Value of write index before the update +uint64_t hsa_queue_add_write_index_acquire(const hsa_queue_t* queue, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->AddWriteIndexAcquire(value); +} + +/// @brief Api to Add to the Write Index of Queue using Relaxed Semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param value Value to add to write index +/// +/// @return uint64_t Value of write index before the update +uint64_t hsa_queue_add_write_index_relaxed(const hsa_queue_t* queue, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->AddWriteIndexRelaxed(value); +} + +/// @brief Api to Add to the Write Index of Queue using Release Semantics +/// +/// @param queue Pointer to the queue whose write index is being updated +/// +/// @param value Value to add to write index +/// +/// @return uint64_t Value of write index before the update +uint64_t hsa_queue_add_write_index_release(const hsa_queue_t* queue, + uint64_t value) { + core::Queue* cmd_queue = core::Queue::Convert(queue); + assert(IsValid(cmd_queue)); + return cmd_queue->AddWriteIndexRelease(value); +} + +//----------------------------------------------------------------------------- +// Memory +//----------------------------------------------------------------------------- +hsa_status_t hsa_agent_iterate_regions( + hsa_agent_t 
agent_handle, + hsa_status_t (*callback)(hsa_region_t region, void* data), void* data) { + IS_OPEN(); + IS_BAD_PTR(callback); + const core::Agent* agent = core::Agent::Convert(agent_handle); + IS_VALID(agent); + return agent->IterateRegion(callback, data); +} + +hsa_status_t hsa_region_get_info(hsa_region_t region, + hsa_region_info_t attribute, + void* value) { + IS_OPEN(); + IS_BAD_PTR(value); + + const core::MemoryRegion* mem_region = core::MemoryRegion::Convert(region); + IS_VALID(mem_region); + + return mem_region->GetInfo(attribute, value); +} + +hsa_status_t hsa_memory_register(void* address, size_t size) { + IS_OPEN(); + + if (size == 0 && address != NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t hsa_memory_deregister(void* address, size_t size) { + IS_OPEN(); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t + hsa_memory_allocate(hsa_region_t region, size_t size, void** ptr) { + IS_OPEN(); + + if (size == 0 || ptr == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + const core::MemoryRegion* mem_region = core::MemoryRegion::Convert(region); + IS_VALID(mem_region); + + return core::Runtime::runtime_singleton_->AllocateMemory(mem_region, size, + ptr); +} + +hsa_status_t hsa_memory_free(void* ptr) { + IS_OPEN(); + + if (ptr == NULL) { + return HSA_STATUS_SUCCESS; + } + + return core::Runtime::runtime_singleton_->FreeMemory(ptr); +} + +hsa_status_t hsa_memory_assign_agent(void* ptr, + hsa_agent_t agent_handle, + hsa_access_permission_t access) { + IS_OPEN(); + + if ((ptr == NULL) || (access < HSA_ACCESS_PERMISSION_RO) || + (access > HSA_ACCESS_PERMISSION_RW)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + const core::Agent* agent = core::Agent::Convert(agent_handle); + IS_VALID(agent); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t hsa_memory_copy(void* dst, const void* src, size_t size) { + IS_OPEN(); + + if (dst == NULL || src == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; 
+  }
+
+  if (size == 0) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  return core::Runtime::runtime_singleton_->CopyMemory(dst, src, size);
+}
+
+//-----------------------------------------------------------------------------
+// Signals
+//-----------------------------------------------------------------------------
+
+// Strict weak ordering on agent handles so hsa_agent_t can be stored in a
+// std::set (used below to detect duplicate consumers).
+typedef struct {
+  bool operator()(const hsa_agent_t& lhs, const hsa_agent_t& rhs) const {
+    return lhs.handle < rhs.handle;
+  }
+} AgentHandleCompare;
+
+// Creates a signal holding initial_value and stores its handle in
+// *hsa_signal.
+//
+// When num_consumers is non-zero, consumers must be non-null and free of
+// duplicates, otherwise HSA_STATUS_ERROR_INVALID_ARGUMENT is returned.
+// An InterruptSignal is created only when core::g_use_interrupt_wait is set
+// and the host agent is a consumer (or no consumer list was given);
+// otherwise a DefaultSignal is created.
+hsa_status_t
+    hsa_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers,
+                      const hsa_agent_t* consumers, hsa_signal_t* hsa_signal) {
+  IS_OPEN();
+  IS_BAD_PTR(hsa_signal);
+
+  core::Signal* ret;
+
+  // With no explicit consumer list the host is treated as a consumer.
+  bool useshost = true;
+
+  if (num_consumers > 0) {
+    IS_BAD_PTR(consumers);
+
+    // Check for duplicates in consumers: a set built from the array is
+    // smaller than the array when any handle repeats.
+    std::set<hsa_agent_t, AgentHandleCompare> consumer_set(
+        consumers, consumers + num_consumers);
+    if (consumer_set.size() != num_consumers) {
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+
+    useshost =
+        (consumer_set.find(
+             core::Runtime::runtime_singleton_->host_agent()->public_handle()) !=
+         consumer_set.end());
+  }
+
+  if (core::g_use_interrupt_wait && useshost) {
+    ret = new core::InterruptSignal(initial_value);
+  } else {
+    ret = new core::DefaultSignal(initial_value);
+  }
+  CHECK_ALLOC(ret);
+
+  *hsa_signal = core::Signal::Convert(ret);
+
+  // A zero handle means the signal object could not be exposed; release it.
+  if (hsa_signal->handle == 0) {
+    delete ret;
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Destroys the signal behind hsa_signal. A zero handle is rejected with
+// HSA_STATUS_ERROR_INVALID_ARGUMENT before any conversion is attempted.
+hsa_status_t hsa_signal_destroy(hsa_signal_t hsa_signal) {
+  IS_OPEN();
+
+  if (hsa_signal.handle == 0) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  IS_VALID(signal);
+  delete signal;
+  return HSA_STATUS_SUCCESS;
+}
+
+// Signal accessors and read-modify-write operations.
+//
+// Every function below converts the public handle with
+// core::Signal::Convert and forwards to the core::Signal method of the same
+// operation and memory ordering. None of them calls IS_OPEN(), and handle
+// validity is checked only through assert(), so nothing is verified when
+// assertions are compiled out.
+
+hsa_signal_value_t hsa_signal_load_relaxed(hsa_signal_t hsa_signal) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->LoadRelaxed();
+}
+
+hsa_signal_value_t hsa_signal_load_acquire(hsa_signal_t hsa_signal) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->LoadAcquire();
+}
+
+void hsa_signal_store_relaxed(hsa_signal_t hsa_signal,
+                              hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->StoreRelaxed(value);
+}
+
+void hsa_signal_store_release(hsa_signal_t hsa_signal,
+                              hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->StoreRelease(value);
+}
+
+// Wait operations return whatever core::Signal::Wait* returns for the given
+// condition, compare value and hints.
+hsa_signal_value_t
+    hsa_signal_wait_relaxed(hsa_signal_t hsa_signal,
+                            hsa_signal_condition_t condition,
+                            hsa_signal_value_t compare_value,
+                            uint64_t timeout_hint,
+                            hsa_wait_state_t wait_state_hint) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->WaitRelaxed(condition, compare_value, timeout_hint,
+                             wait_state_hint);
+}
+
+hsa_signal_value_t
+    hsa_signal_wait_acquire(hsa_signal_t hsa_signal,
+                            hsa_signal_condition_t condition,
+                            hsa_signal_value_t compare_value,
+                            uint64_t timeout_hint,
+                            hsa_wait_state_t wait_state_hint) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->WaitAcquire(condition, compare_value, timeout_hint,
+                             wait_state_hint);
+}
+
+// Bitwise AND.
+void
+    hsa_signal_and_relaxed(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->AndRelaxed(value);
+}
+
+void
+    hsa_signal_and_acquire(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->AndAcquire(value);
+}
+
+void
+    hsa_signal_and_release(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->AndRelease(value);
+}
+
+void
+    hsa_signal_and_acq_rel(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->AndAcqRel(value);
+}
+
+// Bitwise OR.
+void
+    hsa_signal_or_relaxed(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->OrRelaxed(value);
+}
+
+void
+    hsa_signal_or_acquire(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->OrAcquire(value);
+}
+
+void
+    hsa_signal_or_release(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->OrRelease(value);
+}
+
+void
+    hsa_signal_or_acq_rel(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->OrAcqRel(value);
+}
+
+// Bitwise XOR.
+void
+    hsa_signal_xor_relaxed(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->XorRelaxed(value);
+}
+
+void
+    hsa_signal_xor_acquire(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->XorAcquire(value);
+}
+
+void
+    hsa_signal_xor_release(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->XorRelease(value);
+}
+
+void
+    hsa_signal_xor_acq_rel(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->XorAcqRel(value);
+}
+
+// Addition.
+void
+    hsa_signal_add_relaxed(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->AddRelaxed(value);
+}
+
+void
+    hsa_signal_add_acquire(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->AddAcquire(value);
+}
+
+void
+    hsa_signal_add_release(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->AddRelease(value);
+}
+
+void
+    hsa_signal_add_acq_rel(hsa_signal_t hsa_signal, hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->AddAcqRel(value);
+}
+
+// Subtraction.
+void hsa_signal_subtract_relaxed(hsa_signal_t hsa_signal,
+                                 hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->SubRelaxed(value);
+}
+
+void hsa_signal_subtract_acquire(hsa_signal_t hsa_signal,
+                                 hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->SubAcquire(value);
+}
+
+void hsa_signal_subtract_release(hsa_signal_t hsa_signal,
+                                 hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->SubRelease(value);
+}
+
+void hsa_signal_subtract_acq_rel(hsa_signal_t hsa_signal,
+                                 hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  signal->SubAcqRel(value);
+}
+
+// Exchange: returns the value produced by core::Signal::Exch*.
+hsa_signal_value_t
+    hsa_signal_exchange_relaxed(hsa_signal_t hsa_signal,
+                                hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->ExchRelaxed(value);
+}
+
+hsa_signal_value_t
+    hsa_signal_exchange_acquire(hsa_signal_t hsa_signal,
+                                hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->ExchAcquire(value);
+}
+
+hsa_signal_value_t
+    hsa_signal_exchange_release(hsa_signal_t hsa_signal,
+                                hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->ExchRelease(value);
+}
+
+hsa_signal_value_t
+    hsa_signal_exchange_acq_rel(hsa_signal_t hsa_signal,
+                                hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->ExchAcqRel(value);
+}
+
+// Compare-and-swap: returns the value produced by core::Signal::Cas*.
+hsa_signal_value_t hsa_signal_cas_relaxed(hsa_signal_t hsa_signal,
+                                          hsa_signal_value_t expected,
+                                          hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->CasRelaxed(expected, value);
+}
+
+hsa_signal_value_t hsa_signal_cas_acquire(hsa_signal_t hsa_signal,
+                                          hsa_signal_value_t expected,
+                                          hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->CasAcquire(expected, value);
+}
+
+hsa_signal_value_t hsa_signal_cas_release(hsa_signal_t hsa_signal,
+                                          hsa_signal_value_t expected,
+                                          hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->CasRelease(expected, value);
+}
+
+hsa_signal_value_t hsa_signal_cas_acq_rel(hsa_signal_t hsa_signal,
+                                          hsa_signal_value_t expected,
+                                          hsa_signal_value_t value) {
+  core::Signal* signal = core::Signal::Convert(hsa_signal);
+  assert(IsValid(signal));
+  return signal->CasAcqRel(expected, value);
+}
+
+//-----------------------------------------------------------------------------
+// Isa
+//-----------------------------------------------------------------------------
+
+// Looks name up in core::IsaRegistry and stores the matching handle in *isa.
+// Returns HSA_STATUS_ERROR_INVALID_ISA_NAME when the registry has no entry.
+hsa_status_t hsa_isa_from_name(const char* name, hsa_isa_t* isa) {
+  IS_OPEN();
+  IS_BAD_PTR(name);
+  IS_BAD_PTR(isa);
+
+  const core::Isa* isa_object = core::IsaRegistry::GetIsa(name);
+  if (!isa_object) {
+    return HSA_STATUS_ERROR_INVALID_ISA_NAME;
+  }
+
+  *isa = core::Isa::Handle(isa_object);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Queries attribute of isa into value. Only index 0 is accepted; any other
+// index yields HSA_STATUS_ERROR_INVALID_INDEX.
+hsa_status_t hsa_isa_get_info(hsa_isa_t isa, hsa_isa_info_t attribute,
+                              uint32_t index, void* value) {
+  IS_OPEN();
+  IS_BAD_PTR(value);
+
+  if (index != 0) {
+    return HSA_STATUS_ERROR_INVALID_INDEX;
+  }
+
+  const core::Isa* isa_object = core::Isa::Object(isa);
+  IS_VALID(isa_object);
+
+  return isa_object->GetInfo(attribute, value) ?
+      HSA_STATUS_SUCCESS : HSA_STATUS_ERROR_INVALID_ARGUMENT;
+}
+
+// Stores in *result whether code_object_isa is compatible with agent_isa,
+// as decided by core::Isa::IsCompatible.
+hsa_status_t hsa_isa_compatible(hsa_isa_t code_object_isa,
+                                hsa_isa_t agent_isa, bool* result) {
+  IS_OPEN();
+  IS_BAD_PTR(result);
+
+  const core::Isa* code_object_isa_object = core::Isa::Object(code_object_isa);
+  IS_VALID(code_object_isa_object);
+
+  const core::Isa* agent_isa_object = core::Isa::Object(agent_isa);
+  IS_VALID(agent_isa_object);
+
+  *result = code_object_isa_object->IsCompatible(agent_isa_object);
+  return HSA_STATUS_SUCCESS;
+}
+
+//-----------------------------------------------------------------------------
+// Code object.
+//-----------------------------------------------------------------------------
+
+namespace {
+
+// Region-iteration callback. data points at an hsa_region_t whose handle is
+// still 0; the first region that allows runtime allocation is recorded there
+// and iteration is stopped with HSA_STATUS_INFO_BREAK.
+hsa_status_t IsCodeObjectAllocRegion(hsa_region_t region, void *data)
+{
+  assert(nullptr != data);
+  assert(0 == ((hsa_region_t*)data)->handle);
+
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  bool alloc_allowed;
+  if (HSA_STATUS_SUCCESS != (status = HSA::hsa_region_get_info(region, HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED, &alloc_allowed))) {
+    return status;
+  }
+  if (true == alloc_allowed) {
+    ((hsa_region_t*)data)->handle = region.handle;
+    return HSA_STATUS_INFO_BREAK;
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+// Agent-iteration callback. Only CPU agents are searched; their regions are
+// walked with IsCodeObjectAllocRegion. Other agent types are skipped.
+hsa_status_t FindCodeObjectAllocRegionFromAgent(hsa_agent_t agent, void *data)
+{
+  assert(nullptr != data);
+  assert(0 == ((hsa_region_t*)data)->handle);
+
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  hsa_device_type_t agent_type;
+  if (HSA_STATUS_SUCCESS != (status = HSA::hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &agent_type))) {
+    return status;
+  }
+  if (HSA_DEVICE_TYPE_CPU == agent_type) {
+    return HSA::hsa_agent_iterate_regions(agent, IsCodeObjectAllocRegion, data);
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+// Clears the hsa_region_t behind data and searches all agents for a region
+// usable for code object storage. Returns the status of hsa_iterate_agents,
+// which callers below treat as success for both HSA_STATUS_SUCCESS and
+// HSA_STATUS_INFO_BREAK.
+hsa_status_t FindCodeObjectAllocRegionFromSystem(void *data)
+{
+  assert(nullptr != data);
+
+  ((hsa_region_t*)data)->handle = 0;
+  return HSA::hsa_iterate_agents(FindCodeObjectAllocRegionFromAgent, data);
+}
+
+} // namespace anonymous
+
+// Copies the ELF image of code_object into memory obtained from
+// alloc_callback and reports its address and size. The options argument is
+// not used. A failing alloc_callback status is returned unchanged.
+hsa_status_t hsa_code_object_serialize(
+    hsa_code_object_t code_object,
+    hsa_status_t (*alloc_callback)(size_t size, hsa_callback_data_t data,
+                                   void** address),
+    hsa_callback_data_t callback_data, const char* options,
+    void** serialized_code_object, size_t* serialized_code_object_size) {
+  IS_OPEN();
+  IS_BAD_PTR(alloc_callback);
+  IS_BAD_PTR(serialized_code_object);
+  IS_BAD_PTR(serialized_code_object_size);
+
+  AmdHsaCode* code = core::Runtime::runtime_singleton_->code_manager()->FromHandle(code_object);
+  if (!code) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; }
+  size_t elfmemsz = code->ElfSize();
+  const char* elfmemrd = code->ElfData();
+
+  hsa_status_t hsc = alloc_callback(elfmemsz,
+                                    callback_data,
+                                    serialized_code_object);
+  if (HSA_STATUS_SUCCESS != hsc) {
+    return hsc;
+  }
+
+  memcpy(*serialized_code_object, elfmemrd, elfmemsz);
+  *serialized_code_object_size = elfmemsz;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Copies a serialized code object into runtime-allocated memory and returns
+// that memory's address as the code object handle. The options argument is
+// not used. A zero size is rejected with HSA_STATUS_ERROR_INVALID_ARGUMENT.
+hsa_status_t
+    hsa_code_object_deserialize(void* serialized_code_object,
+                                size_t serialized_code_object_size,
+                                const char* options,
+                                hsa_code_object_t* code_object) {
+  IS_OPEN();
+  IS_BAD_PTR(serialized_code_object);
+  IS_BAD_PTR(code_object);
+
+  if (!serialized_code_object_size) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+
+  // Find code object allocation region.
+  hsa_region_t code_object_alloc_region;
+  status = FindCodeObjectAllocRegionFromSystem(&code_object_alloc_region);
+  if (HSA_STATUS_SUCCESS != status && HSA_STATUS_INFO_BREAK != status) {
+    return status;
+  }
+  assert(0 != code_object_alloc_region.handle);
+
+  // Allocate code object memory.
+  void *code_object_alloc_mem = nullptr;
+  status = HSA::hsa_memory_allocate(code_object_alloc_region,
+                                    serialized_code_object_size,
+                                    &code_object_alloc_mem);
+  if (HSA_STATUS_SUCCESS != status) {
+    return status;
+  }
+  assert(nullptr != code_object_alloc_mem);
+
+  // Copy code object into allocated code object memory.
+  status = HSA::hsa_memory_copy(code_object_alloc_mem,
+                                serialized_code_object,
+                                serialized_code_object_size);
+  if (HSA_STATUS_SUCCESS != status) {
+    // No handle refers to the buffer yet, so it must be released here or it
+    // would leak.
+    HSA::hsa_memory_free(code_object_alloc_mem);
+    return status;
+  }
+  code_object->handle = (uint64_t) (uintptr_t) code_object_alloc_mem;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Destroys code_object through the code manager and frees the memory its
+// handle points at. A zero handle, or a handle the code manager refuses to
+// destroy, yields HSA_STATUS_ERROR_INVALID_CODE_OBJECT.
+hsa_status_t hsa_code_object_destroy(hsa_code_object_t code_object) {
+  IS_OPEN();
+
+  void *elfmemrd = reinterpret_cast<void*>(code_object.handle);
+  if (!elfmemrd) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  if (!core::Runtime::runtime_singleton_->code_manager()->Destroy(code_object)) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  HSA::hsa_memory_free(elfmemrd);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Queries attribute of code_object into value. For
+// HSA_CODE_OBJECT_INFO_ISA the ISA name reported by AmdHsaCode is converted
+// to an hsa_isa_t with hsa_isa_from_name; every other attribute is forwarded
+// to AmdHsaCode::GetInfo unchanged.
+hsa_status_t hsa_code_object_get_info(hsa_code_object_t code_object,
+                                      hsa_code_object_info_t attribute,
+                                      void* value) {
+  IS_OPEN();
+  IS_BAD_PTR(value);
+
+  AmdHsaCode* code = core::Runtime::runtime_singleton_->code_manager()->FromHandle(code_object);
+  if (!code) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+  switch (attribute) {
+  case HSA_CODE_OBJECT_INFO_ISA: {
+    // TODO: currently AmdHsaCode::GetInfo return string representation.
+    // Fix when compute capability is available in libamdhsacode.
+    char isa_name[64];
+    hsa_status_t status = code->GetInfo(attribute, &isa_name);
+    if (status != HSA_STATUS_SUCCESS) { return status; }
+    if (HSA_STATUS_SUCCESS != HSA::hsa_isa_from_name(isa_name, (hsa_isa_t*)value)) {
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+    return HSA_STATUS_SUCCESS;
+  }
+  default:
+    return code->GetInfo(attribute, value);
+  }
+}
+
+// Looks up symbol_name in code_object and stores the result in *symbol.
+hsa_status_t hsa_code_object_get_symbol(hsa_code_object_t code_object,
+                                        const char *symbol_name,
+                                        hsa_code_symbol_t *symbol) {
+  IS_OPEN();
+  IS_BAD_PTR(symbol_name);
+  IS_BAD_PTR(symbol);
+
+  AmdHsaCode* code = core::Runtime::runtime_singleton_->code_manager()->FromHandle(code_object);
+  if (!code) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  // TODO(kzhuravl): module_name is NULL until spec is changed, waiting for
+  // Mario.
+  return code->GetSymbol(NULL, symbol_name, symbol);
+}
+
+// Queries attribute of code_symbol into value.
+hsa_status_t hsa_code_symbol_get_info(hsa_code_symbol_t code_symbol,
+                                      hsa_code_symbol_info_t attribute,
+                                      void* value) {
+  IS_OPEN();
+  IS_BAD_PTR(value);
+
+  Symbol* sym = Symbol::FromHandle(code_symbol);
+  // Guard against a handle that converts to null instead of dereferencing
+  // it; the error code mirrors hsa_executable_symbol_get_info below.
+  // NOTE(review): confirm Symbol::FromHandle can return null for bad handles.
+  if (!sym) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  return sym->GetInfo(attribute, value);
+}
+
+// Invokes callback for the symbols of code_object via
+// AmdHsaCode::IterateSymbols, passing data through.
+hsa_status_t hsa_code_object_iterate_symbols(
+    hsa_code_object_t code_object,
+    hsa_status_t (*callback)(hsa_code_object_t code_object,
+                             hsa_code_symbol_t symbol, void* data),
+    void* data) {
+  IS_OPEN();
+  IS_BAD_PTR(callback);
+
+  AmdHsaCode* code = core::Runtime::runtime_singleton_->code_manager()->FromHandle(code_object);
+  if (!code) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  return code->IterateSymbols(code_object, callback, data);
+}
+
+//-----------------------------------------------------------------------------
+// Executable
+//-----------------------------------------------------------------------------
+
+// Creates an executable for profile through the loader and stores its handle
+// in *executable. profile and executable_state must each be one of the two
+// enumerators checked below, otherwise HSA_STATUS_ERROR_INVALID_ARGUMENT is
+// returned. When a frozen executable is requested it is frozen before the
+// handle is published.
+hsa_status_t
+    hsa_executable_create(hsa_profile_t profile,
+                          hsa_executable_state_t executable_state,
+                          const char* options, hsa_executable_t* executable) {
+  IS_OPEN();
+  IS_BAD_PTR(executable);
+
+  if (HSA_PROFILE_BASE != profile && HSA_PROFILE_FULL != profile) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  if (HSA_EXECUTABLE_STATE_FROZEN != executable_state &&
+      HSA_EXECUTABLE_STATE_UNFROZEN != executable_state) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  amd::hsa::loader::Executable *exec = core::Runtime::runtime_singleton_->loader()->CreateExecutable(
+      profile, options);
+  if (!exec) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  // @todo(spec): why did we make it possible to create frozen executable?
+  if (HSA_EXECUTABLE_STATE_FROZEN == executable_state) {
+    // Do not hand out an executable that failed to freeze: destroy it and
+    // report the failure instead of silently ignoring it.
+    hsa_status_t status = exec->Freeze(NULL);
+    if (HSA_STATUS_SUCCESS != status) {
+      core::Runtime::runtime_singleton_->loader()->DestroyExecutable(exec);
+      return status;
+    }
+  }
+
+  *executable = amd::hsa::loader::Executable::Handle(exec);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Destroys the executable behind the handle through the loader.
+hsa_status_t hsa_executable_destroy(hsa_executable_t executable) {
+  IS_OPEN();
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  core::Runtime::runtime_singleton_->loader()->DestroyExecutable(exec);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Forwards to amd_executable_load_code_object and discards the loaded code
+// object handle it produces. No checks are performed here.
+hsa_status_t
+    hsa_executable_load_code_object(hsa_executable_t executable,
+                                    hsa_agent_t agent,
+                                    hsa_code_object_t code_object,
+                                    const char* options) {
+  amd_loaded_code_object_t loaded_code_object = {0};
+  return amd_executable_load_code_object(
+    executable, agent, code_object, options, &loaded_code_object);
+}
+
+// Freezes the executable with the given options.
+hsa_status_t
+    hsa_executable_freeze(hsa_executable_t executable, const char* options) {
+  IS_OPEN();
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  return exec->Freeze(options);
+}
+
+// Queries attribute of the executable into value.
+hsa_status_t hsa_executable_get_info(hsa_executable_t executable,
+                                     hsa_executable_info_t attribute,
+                                     void* value) {
+  IS_OPEN();
+  IS_BAD_PTR(value);
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  return exec->GetInfo(attribute, value);
+}
+
+// Defines a program-scope external variable at address.
+hsa_status_t
+    hsa_executable_global_variable_define(hsa_executable_t executable,
+                                          const char* variable_name,
+                                          void* address) {
+  IS_OPEN();
+  IS_BAD_PTR(variable_name);
+  IS_BAD_PTR(address);
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  return exec->DefineProgramExternalVariable(variable_name, address);
+}
+
+// Defines an agent-scope external variable in the global segment.
+hsa_status_t
+    hsa_executable_agent_global_variable_define(hsa_executable_t executable,
+                                                hsa_agent_t agent,
+                                                const char* variable_name,
+                                                void* address) {
+  IS_OPEN();
+  IS_BAD_PTR(variable_name);
+  IS_BAD_PTR(address);
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  return exec->DefineAgentExternalVariable(
+    variable_name, agent, HSA_VARIABLE_SEGMENT_GLOBAL, address);
+}
+
+// Defines an agent-scope external variable in the readonly segment.
+hsa_status_t
+    hsa_executable_readonly_variable_define(hsa_executable_t executable,
+                                            hsa_agent_t agent,
+                                            const char* variable_name,
+                                            void* address) {
+  IS_OPEN();
+  IS_BAD_PTR(variable_name);
+  IS_BAD_PTR(address);
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  return exec->DefineAgentExternalVariable(
+    variable_name, agent, HSA_VARIABLE_SEGMENT_READONLY, address);
+}
+
+// Validates the executable; the outcome is written to *result by
+// Executable::Validate.
+hsa_status_t
+    hsa_executable_validate(hsa_executable_t executable, uint32_t* result) {
+  IS_OPEN();
+  IS_BAD_PTR(result);
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  return exec->Validate(result);
+}
+
+// Looks up a symbol of the executable and stores its handle in *symbol.
+// A NULL module_name is passed to the loader as the empty string. Returns
+// HSA_STATUS_ERROR_INVALID_SYMBOL_NAME when no symbol is found.
+hsa_status_t
+    hsa_executable_get_symbol(hsa_executable_t executable,
+                              const char* module_name, const char* symbol_name,
+                              hsa_agent_t agent, int32_t call_convention,
+                              hsa_executable_symbol_t* symbol) {
+  IS_OPEN();
+  IS_BAD_PTR(symbol_name);
+  IS_BAD_PTR(symbol);
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  amd::hsa::loader::Symbol *sym =
+    exec->GetSymbol(module_name == NULL ? "" : module_name, symbol_name, agent, call_convention);
+  if (!sym) {
+    return HSA_STATUS_ERROR_INVALID_SYMBOL_NAME;
+  }
+  *symbol = amd::hsa::loader::Symbol::Handle(sym);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Queries attribute of executable_symbol into value. A handle that converts
+// to null, or an attribute Symbol::GetInfo rejects, yields
+// HSA_STATUS_ERROR_INVALID_ARGUMENT.
+hsa_status_t
+    hsa_executable_symbol_get_info(hsa_executable_symbol_t executable_symbol,
+                                   hsa_executable_symbol_info_t attribute,
+                                   void* value) {
+  IS_OPEN();
+  IS_BAD_PTR(value);
+
+  amd::hsa::loader::Symbol *sym = amd::hsa::loader::Symbol::Object(executable_symbol);
+  if (!sym) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  return sym->GetInfo(attribute, value) ?
+    HSA_STATUS_SUCCESS : HSA_STATUS_ERROR_INVALID_ARGUMENT;
+}
+
+// Invokes callback for the symbols of the executable via
+// Executable::IterateSymbols, passing data through.
+hsa_status_t hsa_executable_iterate_symbols(
+    hsa_executable_t executable,
+    hsa_status_t (*callback)(hsa_executable_t executable,
+                             hsa_executable_symbol_t symbol, void* data),
+    void* data) {
+  IS_OPEN();
+  IS_BAD_PTR(callback);
+
+  amd::hsa::loader::Executable *exec = amd::hsa::loader::Executable::Object(executable);
+  if (!exec) {
+    return HSA_STATUS_ERROR_INVALID_EXECUTABLE;
+  }
+
+  return exec->IterateSymbols(callback, data);
+}
+
+//-----------------------------------------------------------------------------
+// Errors
+//-----------------------------------------------------------------------------
+
+// Stores in *status_string a pointer to a static description of status.
+// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a status value that has no
+// case below; *status_string is left untouched in that situation.
+hsa_status_t
+    hsa_status_string(hsa_status_t status, const char** status_string) {
+  IS_OPEN();
+  IS_BAD_PTR(status_string);
+  // Switch on an integer so extension status codes (HSA_EXT_*) that are not
+  // enumerators of hsa_status_t can appear as case labels.
+  const size_t status_u = static_cast<size_t>(status);
+  switch (status_u) {
+    case HSA_STATUS_SUCCESS:
+      *status_string =
+          "HSA_STATUS_SUCCESS: The function has been executed successfully.";
+      break;
+    case HSA_STATUS_INFO_BREAK:
+      *status_string =
+          "HSA_STATUS_INFO_BREAK: A traversal over a list of "
+          "elements has been interrupted by the application before "
+          "completing.";
+      break;
+    case HSA_STATUS_ERROR:
+      *status_string = "HSA_STATUS_ERROR: A generic error has occurred.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_ARGUMENT:
+      *status_string =
+          "HSA_STATUS_ERROR_INVALID_ARGUMENT: One of the actual "
+          "arguments does not meet a precondition stated in the "
+          "documentation of the corresponding formal argument.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_QUEUE_CREATION:
+      *status_string =
+          "HSA_STATUS_ERROR_INVALID_QUEUE_CREATION: The requested "
+          "queue creation is not valid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_ALLOCATION:
+      *status_string =
+          "HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested "
+          "allocation is not valid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_AGENT:
+      *status_string =
+          "HSA_STATUS_ERROR_INVALID_AGENT: The agent is invalid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_REGION:
+      *status_string =
+          "HSA_STATUS_ERROR_INVALID_REGION: The memory region is invalid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_SIGNAL:
+      *status_string =
+          "HSA_STATUS_ERROR_INVALID_SIGNAL: The signal is invalid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_QUEUE:
+      *status_string =
+          "HSA_STATUS_ERROR_INVALID_QUEUE: The queue is invalid.";
+      break;
+    case HSA_STATUS_ERROR_OUT_OF_RESOURCES:
+      *status_string =
+          "HSA_STATUS_ERROR_OUT_OF_RESOURCES: The runtime failed to "
+          "allocate the necessary resources. This error may also "
+          "occur when the core runtime library needs to spawn "
+          "threads or create internal OS-specific events.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT:
+      *status_string =
+          "HSA_STATUS_ERROR_INVALID_PACKET_FORMAT: The AQL packet "
+          "is malformed.";
+      break;
+    case HSA_STATUS_ERROR_RESOURCE_FREE:
+      *status_string =
+          "HSA_STATUS_ERROR_RESOURCE_FREE: An error has been "
+          "detected while releasing a resource.";
+      break;
+    case HSA_STATUS_ERROR_NOT_INITIALIZED:
+      *status_string =
+          "HSA_STATUS_ERROR_NOT_INITIALIZED: An API other than "
+          "hsa_init has been invoked while the reference count of "
+          "the HSA runtime is zero.";
+      break;
+    case HSA_STATUS_ERROR_REFCOUNT_OVERFLOW:
+      *status_string =
+          "HSA_STATUS_ERROR_REFCOUNT_OVERFLOW: The maximum "
+          "reference count for the object has been reached.";
+      break;
+    case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS:
+      *status_string =
+          "HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS: The arguments passed to "
+          "a functions are not compatible.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_INDEX:
+      *status_string = "The index is invalid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_ISA:
+      *status_string = "The instruction set architecture is invalid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_ISA_NAME:
+      // Returned by hsa_isa_from_name above, so it needs a description too.
+      *status_string = "The ISA name is invalid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_CODE_OBJECT:
+      *status_string = "The code object is invalid.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_EXECUTABLE:
+      *status_string = "The executable is invalid.";
+      break;
+    case HSA_STATUS_ERROR_FROZEN_EXECUTABLE:
+      *status_string = "The executable is frozen.";
+      break;
+    case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME:
+      *status_string = "There is no symbol with the given name.";
+      break;
+    case HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED:
+      *status_string = "The variable is already defined.";
+      break;
+    case HSA_STATUS_ERROR_VARIABLE_UNDEFINED:
+      *status_string = "The variable is undefined.";
+      break;
+    case HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED:
+      *status_string =
+          "HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED: Image "
+          "format is not supported.";
+      break;
+    case HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED:
+      *status_string =
+          "HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED: Image size "
+          "is not supported.";
+      break;
+    case HSA_EXT_STATUS_ERROR_INVALID_PROGRAM:
+      *status_string =
+          "HSA_EXT_STATUS_ERROR_INVALID_PROGRAM: Invalid program";
+      break;
+    case HSA_EXT_STATUS_ERROR_INVALID_MODULE:
+      *status_string = "HSA_EXT_STATUS_ERROR_INVALID_MODULE: Invalid module";
+      break;
+    case HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE:
+      *status_string =
+          "HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE: Incompatible module";
+      break;
+    case HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED:
+      *status_string =
+          "HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED: Module already "
+          "included";
+      break;
+    case HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH:
+      *status_string =
+          "HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH: Symbol mismatch";
+      break;
+    case HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED:
+      *status_string =
+          "HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED: Finalization failed";
+      break;
+    case HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH:
+      *status_string =
+          "HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH: Directive mismatch";
+      break;
+    default:
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+}  // end of namespace HSA
diff --git a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp
new file mode 100644
index 0000000000..0eeab7448c
--- /dev/null
+++ b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp
@@ -0,0 +1,191 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "core/inc/hsa_api_trace_int.h"
+#include "core/inc/runtime.h"
+#include "core/inc/hsa_table_interface.h"
+
+namespace core {
+
+// Dispatch tables for the core HSA API; both start out filled by the
+// ApiTable constructor.
+ApiTable hsa_api_table_;
+ApiTable hsa_internal_api_table_;
+
+// Starts with no extension table linked, then fills every entry with the
+// HSA:: implementation.
+ApiTable::ApiTable() {
+  table.std_exts_ = NULL;
+  Reset();
+}
+
+// Attaches an extension table and keeps a copy of its current contents so
+// that Reset() can restore them later.
+void ApiTable::LinkExts(ExtTable* ptr) {
+  assert(ptr != NULL && "Invalid extension table linked.");
+  extension_backup = *ptr;
+  table.std_exts_ = ptr;
+}
+
+// Points every table entry back at the HSA:: function of the same name and,
+// when an extension table is linked, restores it from the backup copy.
+void ApiTable::Reset() {
+// Binds table.<api>_fn to HSA::<api>; all entries follow this naming rule.
+#define HSA_API_BIND(api) table.api##_fn = HSA::api
+
+  // Runtime and system.
+  HSA_API_BIND(hsa_init);
+  HSA_API_BIND(hsa_shut_down);
+  HSA_API_BIND(hsa_system_get_info);
+  HSA_API_BIND(hsa_system_extension_supported);
+  HSA_API_BIND(hsa_system_get_extension_table);
+
+  // Agents.
+  HSA_API_BIND(hsa_iterate_agents);
+  HSA_API_BIND(hsa_agent_get_info);
+  HSA_API_BIND(hsa_agent_get_exception_policies);
+  HSA_API_BIND(hsa_agent_extension_supported);
+
+  // Queues.
+  HSA_API_BIND(hsa_queue_create);
+  HSA_API_BIND(hsa_soft_queue_create);
+  HSA_API_BIND(hsa_queue_destroy);
+  HSA_API_BIND(hsa_queue_inactivate);
+  HSA_API_BIND(hsa_queue_load_read_index_acquire);
+  HSA_API_BIND(hsa_queue_load_read_index_relaxed);
+  HSA_API_BIND(hsa_queue_load_write_index_acquire);
+  HSA_API_BIND(hsa_queue_load_write_index_relaxed);
+  HSA_API_BIND(hsa_queue_store_write_index_relaxed);
+  HSA_API_BIND(hsa_queue_store_write_index_release);
+  HSA_API_BIND(hsa_queue_cas_write_index_acq_rel);
+  HSA_API_BIND(hsa_queue_cas_write_index_acquire);
+  HSA_API_BIND(hsa_queue_cas_write_index_relaxed);
+  HSA_API_BIND(hsa_queue_cas_write_index_release);
+  HSA_API_BIND(hsa_queue_add_write_index_acq_rel);
+  HSA_API_BIND(hsa_queue_add_write_index_acquire);
+  HSA_API_BIND(hsa_queue_add_write_index_relaxed);
+  HSA_API_BIND(hsa_queue_add_write_index_release);
+  HSA_API_BIND(hsa_queue_store_read_index_relaxed);
+  HSA_API_BIND(hsa_queue_store_read_index_release);
+
+  // Memory.
+  HSA_API_BIND(hsa_agent_iterate_regions);
+  HSA_API_BIND(hsa_region_get_info);
+  HSA_API_BIND(hsa_memory_register);
+  HSA_API_BIND(hsa_memory_deregister);
+  HSA_API_BIND(hsa_memory_allocate);
+  HSA_API_BIND(hsa_memory_free);
+  HSA_API_BIND(hsa_memory_copy);
+  HSA_API_BIND(hsa_memory_assign_agent);
+
+  // Signals.
+  HSA_API_BIND(hsa_signal_create);
+  HSA_API_BIND(hsa_signal_destroy);
+  HSA_API_BIND(hsa_signal_load_relaxed);
+  HSA_API_BIND(hsa_signal_load_acquire);
+  HSA_API_BIND(hsa_signal_store_relaxed);
+  HSA_API_BIND(hsa_signal_store_release);
+  HSA_API_BIND(hsa_signal_wait_relaxed);
+  HSA_API_BIND(hsa_signal_wait_acquire);
+  HSA_API_BIND(hsa_signal_and_relaxed);
+  HSA_API_BIND(hsa_signal_and_acquire);
+  HSA_API_BIND(hsa_signal_and_release);
+  HSA_API_BIND(hsa_signal_and_acq_rel);
+  HSA_API_BIND(hsa_signal_or_relaxed);
+  HSA_API_BIND(hsa_signal_or_acquire);
+  HSA_API_BIND(hsa_signal_or_release);
+  HSA_API_BIND(hsa_signal_or_acq_rel);
+  HSA_API_BIND(hsa_signal_xor_relaxed);
+  HSA_API_BIND(hsa_signal_xor_acquire);
+  HSA_API_BIND(hsa_signal_xor_release);
+  HSA_API_BIND(hsa_signal_xor_acq_rel);
+  HSA_API_BIND(hsa_signal_exchange_relaxed);
+  HSA_API_BIND(hsa_signal_exchange_acquire);
+  HSA_API_BIND(hsa_signal_exchange_release);
+  HSA_API_BIND(hsa_signal_exchange_acq_rel);
+  HSA_API_BIND(hsa_signal_add_relaxed);
+  HSA_API_BIND(hsa_signal_add_acquire);
+  HSA_API_BIND(hsa_signal_add_release);
+  HSA_API_BIND(hsa_signal_add_acq_rel);
+  HSA_API_BIND(hsa_signal_subtract_relaxed);
+  HSA_API_BIND(hsa_signal_subtract_acquire);
+  HSA_API_BIND(hsa_signal_subtract_release);
+  HSA_API_BIND(hsa_signal_subtract_acq_rel);
+  HSA_API_BIND(hsa_signal_cas_relaxed);
+  HSA_API_BIND(hsa_signal_cas_acquire);
+  HSA_API_BIND(hsa_signal_cas_release);
+  HSA_API_BIND(hsa_signal_cas_acq_rel);
+
+  // ISA.
+  HSA_API_BIND(hsa_isa_from_name);
+  HSA_API_BIND(hsa_isa_get_info);
+  HSA_API_BIND(hsa_isa_compatible);
+
+  // Code objects.
+  HSA_API_BIND(hsa_code_object_serialize);
+  HSA_API_BIND(hsa_code_object_deserialize);
+  HSA_API_BIND(hsa_code_object_destroy);
+  HSA_API_BIND(hsa_code_object_get_info);
+  HSA_API_BIND(hsa_code_object_get_symbol);
+  HSA_API_BIND(hsa_code_symbol_get_info);
+  HSA_API_BIND(hsa_code_object_iterate_symbols);
+
+  // Executables.
+  HSA_API_BIND(hsa_executable_create);
+  HSA_API_BIND(hsa_executable_destroy);
+  HSA_API_BIND(hsa_executable_load_code_object);
+  HSA_API_BIND(hsa_executable_freeze);
+  HSA_API_BIND(hsa_executable_get_info);
+  HSA_API_BIND(hsa_executable_global_variable_define);
+  HSA_API_BIND(hsa_executable_agent_global_variable_define);
+  HSA_API_BIND(hsa_executable_readonly_variable_define);
+  HSA_API_BIND(hsa_executable_validate);
+  HSA_API_BIND(hsa_executable_get_symbol);
+  HSA_API_BIND(hsa_executable_symbol_get_info);
+  HSA_API_BIND(hsa_executable_iterate_symbols);
+
+  // Errors.
+  HSA_API_BIND(hsa_status_string);
+
+#undef HSA_API_BIND
+
+  // Undo any changes made to a linked extension table since LinkExts().
+  if (table.std_exts_ != NULL) {
+    *table.std_exts_ = extension_backup;
+  }
+}
+
+// Registers the core API table with the table interface during static
+// initialization of this translation unit.
+class Init {
+ public:
+  Init() { hsa_table_interface_init(&hsa_api_table_.table); }
+};
+static Init LinkAtLoad;
+}
diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
new file mode 100644
index 0000000000..9394c30062
--- /dev/null
+++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
@@ -0,0 +1,555 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "hsakmt.h" + +#include "core/inc/runtime.h" +#include "core/inc/agent.h" +#include "core/inc/amd_cpu_agent.h" +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/amd_memory_region.h" +#include "core/inc/signal.h" +#include "core/inc/interrupt_signal.h" + +template +struct ValidityError; +template <> +struct ValidityError { + enum { value = HSA_STATUS_ERROR_INVALID_SIGNAL }; +}; + +template <> +struct ValidityError { + enum { value = HSA_STATUS_ERROR_INVALID_AGENT }; +}; + +template <> +struct ValidityError { + enum { value = HSA_STATUS_ERROR_INVALID_REGION }; +}; + +template <> +struct ValidityError { + enum { value = HSA_STATUS_ERROR_INVALID_REGION }; +}; + +template <> +struct ValidityError { + enum { value = HSA_STATUS_ERROR_INVALID_QUEUE }; +}; + +template +struct ValidityError { + enum { value = ValidityError::value }; +}; + +#define IS_BAD_PTR(ptr) \ + do { \ + if ((ptr) == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT; \ + } while (false) + +#define IS_VALID(ptr) \ + do { \ + if ((ptr) == NULL || !(ptr)->IsValid()) \ + return hsa_status_t(ValidityError::value); \ + } while (false) + +#define CHECK_ALLOC(ptr) \ + do { \ + if ((ptr) == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; \ + } while (false) + +#define IS_OPEN() \ + do { \ + if (!core::Runtime::runtime_singleton_->IsOpen()) \ + return HSA_STATUS_ERROR_NOT_INITIALIZED; \ + } while (false) + +template +static __forceinline bool IsValid(T* ptr) { + return (ptr == NULL) ? 
NULL : ptr->IsValid(); +} + +hsa_status_t HSA_API + hsa_amd_coherency_get_type(hsa_agent_t agent_handle, + hsa_amd_coherency_type_t* type) { + IS_OPEN(); + + const core::Agent* agent = core::Agent::Convert(agent_handle); + + IS_VALID(agent); + + IS_BAD_PTR(type); + + if (agent->device_type() != core::Agent::kAmdGpuDevice) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + + const amd::GpuAgentInt* gpu_agent = + static_cast(agent); + + *type = gpu_agent->current_coherency_type(); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent_handle, + hsa_amd_coherency_type_t type) { + IS_OPEN(); + + core::Agent* agent = core::Agent::Convert(agent_handle); + + IS_VALID(agent); + + if (type < HSA_AMD_COHERENCY_TYPE_COHERENT || + type > HSA_AMD_COHERENCY_TYPE_NONCOHERENT) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + if (agent->device_type() != core::Agent::kAmdGpuDevice) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + + amd::GpuAgent* gpu_agent = static_cast(agent); + + if (!gpu_agent->current_coherency_type(type)) { + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t HSA_API + hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count) { + IS_OPEN(); + + if (ptr == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + if (count == 0) { + return HSA_STATUS_SUCCESS; + } + + return core::Runtime::runtime_singleton_->FillMemory(ptr, value, count); +} + +hsa_status_t HSA_API + hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle, + const void* src, hsa_agent_t src_agent_handle, + size_t size, uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + if (dst == NULL || src == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + if ((num_dep_signals == 0 && dep_signals != NULL) || + (num_dep_signals > 0 && dep_signals == NULL)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + core::Agent* dst_agent = 
core::Agent::Convert(dst_agent_handle); + IS_VALID(dst_agent); + + core::Agent* src_agent = core::Agent::Convert(src_agent_handle); + IS_VALID(src_agent); + + std::vector dep_signal_list(num_dep_signals); + if (num_dep_signals > 0) { + for (size_t i = 0; i < num_dep_signals; ++i) { + core::Signal* dep_signal_obj = core::Signal::Convert(dep_signals[i]); + IS_VALID(dep_signal_obj); + dep_signal_list[i] = dep_signal_obj; + } + } + + core::Signal* out_signal_obj = core::Signal::Convert(completion_signal); + IS_VALID(out_signal_obj); + + if (size > 0) { + return core::Runtime::runtime_singleton_->CopyMemory( + dst, *dst_agent, src, *src_agent, size, dep_signal_list, + *out_signal_obj); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t HSA_API + hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) { + IS_OPEN(); + + core::Queue* cmd_queue = core::Queue::Convert(queue); + + IS_VALID(cmd_queue); + + AMD_HSA_BITS_SET(cmd_queue->amd_queue_.queue_properties, + AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, (enable != 0)); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( + hsa_agent_t agent_handle, hsa_signal_t hsa_signal, + hsa_amd_profiling_dispatch_time_t* time) { + IS_OPEN(); + + IS_BAD_PTR(time); + + core::Agent* agent = core::Agent::Convert(agent_handle); + + IS_VALID(agent); + + core::Signal* signal = core::Signal::Convert(hsa_signal); + + IS_VALID(signal); + + if (agent->device_type() != core::Agent::kAmdGpuDevice) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + + amd::GpuAgentInt* gpu_agent = static_cast(agent); + + gpu_agent->TranslateTime(signal, *time); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t HSA_API + hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent_handle, + uint64_t agent_tick, + uint64_t* system_tick) { + IS_OPEN(); + + IS_BAD_PTR(system_tick); + + core::Agent* agent = core::Agent::Convert(agent_handle); + + IS_VALID(agent); + + if (agent->device_type() != 
core::Agent::kAmdGpuDevice) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + + amd::GpuAgentInt* gpu_agent = static_cast(agent); + + *system_tick = gpu_agent->TranslateTime(agent_tick); + + return HSA_STATUS_SUCCESS; +} + +uint32_t HSA_API + hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* hsa_signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout_hint, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value) { + // Do not check for signal invalidation. Invalidation may occur during async + // signal handler loop and is not an error. + for (uint i = 0; i < signal_count; i++) + assert(hsa_signals[i].handle != 0 && + static_cast*>( + core::Signal::Convert(hsa_signals[i]))->IsValid() && + "Invalid signal."); + + return core::Signal::WaitAny(signal_count, hsa_signals, conds, values, + timeout_hint, wait_hint, satisfying_value); +} + +hsa_status_t HSA_API + hsa_amd_signal_async_handler(hsa_signal_t hsa_signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg) { + IS_OPEN(); + + core::Signal* signal = core::Signal::Convert(hsa_signal); + IS_VALID(signal); + IS_BAD_PTR(handler); + if (!core::InterruptSignal::IsType(signal)) + return HSA_STATUS_ERROR_INVALID_SIGNAL; + return core::Runtime::runtime_singleton_->SetAsyncSignalHandler( + hsa_signal, cond, value, handler, arg); +} + +hsa_status_t HSA_API + hsa_amd_async_function(void (*callback)(void* arg), void* arg) { + IS_OPEN(); + + IS_BAD_PTR(callback); + static const hsa_signal_t null_signal = {0}; + return core::Runtime::runtime_singleton_->SetAsyncSignalHandler( + null_signal, HSA_SIGNAL_CONDITION_EQ, 0, (hsa_amd_signal_handler)callback, + arg); +} + +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask) { + IS_OPEN(); + IS_BAD_PTR(cu_mask); + + core::Queue* cmd_queue = core::Queue::Convert(queue); + IS_VALID(cmd_queue); + return 
cmd_queue->SetCUMasking(num_cu_mask_count, cu_mask); +} +// Checks the arguments, then calls Lock() on system_regions_fine()[0]. +hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, + hsa_agent_t* agents, int num_agent, + void** agent_ptr) { + // agent_ptr is only validated below, so guard the up-front reset. + if (agent_ptr != NULL) *agent_ptr = NULL; + IS_OPEN(); + + if (size == 0 || host_ptr == NULL || agent_ptr == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + if ((agents != NULL && num_agent == 0) || + (agents == NULL && num_agent != 0)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + const amd::MemoryRegion* system_region = + reinterpret_cast( + core::Runtime::runtime_singleton_->system_regions_fine()[0]); + + return system_region->Lock(num_agent, agents, host_ptr, size, agent_ptr); +} + +hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr) { + IS_OPEN(); + + const amd::MemoryRegion* system_region = + reinterpret_cast( + core::Runtime::runtime_singleton_->system_regions_fine()[0]); + + return system_region->Unlock(host_ptr); +} + +hsa_status_t HSA_API + hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, + hsa_amd_memory_pool_info_t attribute, + void* value) { + IS_OPEN(); + IS_BAD_PTR(value); + // Also require IsValid(), matching the other memory-pool entry points. + hsa_region_t region = {memory_pool.handle}; + const amd::MemoryRegion* mem_region = amd::MemoryRegion::Convert(region); + if (mem_region == NULL || !mem_region->IsValid()) { + return (hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL; + } + + return mem_region->GetPoolInfo(attribute, value); +} + +hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( + hsa_agent_t agent_handle, + hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data), + void* data) { + IS_OPEN(); + IS_BAD_PTR(callback); + const core::Agent* agent = core::Agent::Convert(agent_handle); + IS_VALID(agent); + + if (agent->device_type() == core::Agent::kAmdCpuDevice) { + return reinterpret_cast(agent)->VisitRegion( + false, reinterpret_cast(callback), + data); + } + + return reinterpret_cast(agent)->VisitRegion( + false, + reinterpret_cast( + callback), + data); +} + +hsa_status_t HSA_API + 
hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size, + uint32_t flags, void** ptr) { + IS_OPEN(); + + if (size == 0 || ptr == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_region_t region = {memory_pool.handle}; + const core::MemoryRegion* mem_region = core::MemoryRegion::Convert(region); + + if (mem_region == NULL || !mem_region->IsValid()) { + return (hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL; + } + + return core::Runtime::runtime_singleton_->AllocateMemory(true, mem_region, + size, ptr); +} + +hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr) { + return HSA::hsa_memory_free(ptr); +} + +hsa_status_t HSA_API + hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents, + const uint32_t* flags, const void* ptr) { + IS_OPEN(); + + if (num_agents == 0 || agents == NULL || flags != NULL || ptr == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + return core::Runtime::runtime_singleton_->AllowAccess(num_agents, agents, + ptr); +} + +hsa_status_t HSA_API + hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool, + hsa_amd_memory_pool_t dst_memory_pool, + bool* result) { + IS_OPEN(); + + if (result == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_region_t src_region_handle = {src_memory_pool.handle}; + const amd::MemoryRegion* src_mem_region = + amd::MemoryRegion::Convert(src_region_handle); + + if (src_mem_region == NULL || !src_mem_region->IsValid()) { + return static_cast(HSA_STATUS_ERROR_INVALID_MEMORY_POOL); + } + + hsa_region_t dst_region_handle = {dst_memory_pool.handle}; + const amd::MemoryRegion* dst_mem_region = + amd::MemoryRegion::Convert(dst_region_handle); + + if (dst_mem_region == NULL || !dst_mem_region->IsValid()) { + return static_cast(HSA_STATUS_ERROR_INVALID_MEMORY_POOL); + } + + return src_mem_region->CanMigrate(*dst_mem_region, *result); +} + +hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, + hsa_amd_memory_pool_t 
memory_pool, + uint32_t flags) { + IS_OPEN(); + + if (ptr == NULL || flags != 0) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_region_t dst_region_handle = {memory_pool.handle}; + const amd::MemoryRegion* dst_mem_region = + amd::MemoryRegion::Convert(dst_region_handle); + + if (dst_mem_region == NULL || !dst_mem_region->IsValid()) { + return static_cast(HSA_STATUS_ERROR_INVALID_MEMORY_POOL); + } + + return dst_mem_region->Migrate(flags, ptr); +} + +hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( + hsa_agent_t agent_handle, hsa_amd_memory_pool_t memory_pool, + hsa_amd_agent_memory_pool_info_t attribute, void* value) { + IS_OPEN(); + + if (value == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + const core::Agent* agent = core::Agent::Convert(agent_handle); + IS_VALID(agent); + + hsa_region_t region_handle = {memory_pool.handle}; + const amd::MemoryRegion* mem_region = + amd::MemoryRegion::Convert(region_handle); + + if (mem_region == NULL || !mem_region->IsValid()) { + return static_cast(HSA_STATUS_ERROR_INVALID_MEMORY_POOL); + } + + return mem_region->GetAgentPoolInfo(*agent, attribute, value); +} +// Converts the agent handles and forwards the request to InteropMap(). +hsa_status_t hsa_amd_interop_map_buffer(uint32_t num_agents, + hsa_agent_t* agents, int interop_handle, + uint32_t flags, size_t* size, + void** ptr, size_t* metadata_size, + const void** metadata) { + IS_OPEN(); + IS_BAD_PTR(agents); + IS_BAD_PTR(size); + IS_BAD_PTR(ptr); + if (flags != 0 || num_agents == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + core::Agent* short_agents[64]; + core::Agent** core_agents = short_agents; + if (num_agents > 64) { + core_agents = new core::Agent* [num_agents]; + if (core_agents == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + // Free the heap-allocated list before returning on an invalid agent handle. + for (uint32_t i = 0; i < num_agents; i++) { + core::Agent* device = core::Agent::Convert(agents[i]); + if (device == NULL || !device->IsValid()) { + if (num_agents > 64) delete[] core_agents; + return HSA_STATUS_ERROR_INVALID_AGENT; + } + core_agents[i] = device; + } + auto ret = core::Runtime::runtime_singleton_->InteropMap( + 
num_agents, core_agents, interop_handle, flags, size, ptr, metadata_size, + metadata); + + if (num_agents > 64) delete[] core_agents; + return ret; +} + +hsa_status_t hsa_amd_interop_unmap_buffer(void* ptr) { + IS_OPEN(); + if (ptr != NULL) core::Runtime::runtime_singleton_->InteropUnmap(ptr); + return HSA_STATUS_SUCCESS; +} diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_interface.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_interface.cpp new file mode 100644 index 0000000000..268e98ebdf --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/hsa_ext_interface.cpp @@ -0,0 +1,530 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/hsa_ext_interface.h" + +#include "core/inc/runtime.h" + +namespace core { +// Implementations for missing / unsupported extensions +template +static T0 hsa_ext_null() { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} 
+template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} +template +static T0 hsa_ext_null(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20) { + return HSA_STATUS_ERROR_NOT_INITIALIZED; +} + +ExtensionEntryPoints::ExtensionEntryPoints() { InitTable(); } + +void ExtensionEntryPoints::InitTable() { + table.hsa_ext_program_create_fn = hsa_ext_null; + table.hsa_ext_program_destroy_fn = hsa_ext_null; + table.hsa_ext_program_add_module_fn = hsa_ext_null; + table.hsa_ext_program_iterate_modules_fn = hsa_ext_null; + table.hsa_ext_program_get_info_fn = hsa_ext_null; + table.hsa_ext_program_finalize_fn = hsa_ext_null; + table.hsa_ext_image_get_capability_fn = hsa_ext_null; + table.hsa_ext_image_data_get_info_fn = hsa_ext_null; + table.hsa_ext_image_create_fn = hsa_ext_null; + 
table.hsa_ext_image_import_fn = hsa_ext_null; + table.hsa_ext_image_export_fn = hsa_ext_null; + table.hsa_ext_image_copy_fn = hsa_ext_null; + table.hsa_ext_image_clear_fn = hsa_ext_null; + table.hsa_ext_image_destroy_fn = hsa_ext_null; + table.hsa_ext_sampler_create_fn = hsa_ext_null; + table.hsa_ext_sampler_destroy_fn = hsa_ext_null; + table.hsa_amd_image_get_info_max_dim_fn = hsa_ext_null; + table.hsa_amd_image_create_fn = hsa_ext_null; +} + +void ExtensionEntryPoints::Unload() { + for (int i = 0; i < libs_.size(); i++) { + void* ptr = os::GetExportAddress(libs_[i], "Unload"); + if (ptr) { + ((Unload_t)ptr)(); + } + } + // Due to valgrind bug, runtime cannot dlclose extensions see: + // http://valgrind.org/docs/manual/faq.html#faq.unhelpful + if (os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND") != "1") { + for (int i = 0; i < libs_.size(); i++) { + os::CloseLib(libs_[i]); + } + } + libs_.clear(); + InitTable(); +} + +bool ExtensionEntryPoints::Load(std::string library_name) { + os::LibHandle lib = os::LoadLib(library_name); + if (lib == NULL) { + return false; + } + libs_.push_back(lib); + + void* ptr; + + ptr = os::GetExportAddress(lib, "hsa_ext_program_create_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_program_create_fn == + (decltype(::hsa_ext_program_create)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_program_create_fn = (decltype(::hsa_ext_program_create)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_destroy_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_program_destroy_fn == + (decltype(::hsa_ext_program_destroy)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_program_destroy_fn = + (decltype(::hsa_ext_program_destroy)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_add_module_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_program_add_module_fn == + (decltype(::hsa_ext_program_add_module)*)hsa_ext_null && + "Duplicate load of extension import."); + 
table.hsa_ext_program_add_module_fn = + (decltype(::hsa_ext_program_add_module)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_iterate_modules_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_program_iterate_modules_fn == + (decltype(::hsa_ext_program_iterate_modules)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_program_iterate_modules_fn = + (decltype(::hsa_ext_program_iterate_modules)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_get_info_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_program_get_info_fn == + (decltype(::hsa_ext_program_get_info)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_program_get_info_fn = + (decltype(::hsa_ext_program_get_info)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_program_finalize_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_program_finalize_fn == + (decltype(::hsa_ext_program_finalize)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_program_finalize_fn = + (decltype(::hsa_ext_program_finalize)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_image_get_capability_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_image_get_capability_fn == + (decltype(::hsa_ext_image_get_capability)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_image_get_capability_fn = + (decltype(::hsa_ext_image_get_capability)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_image_data_get_info_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_image_data_get_info_fn == + (decltype(::hsa_ext_image_data_get_info)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_image_data_get_info_fn = + (decltype(::hsa_ext_image_data_get_info)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_image_create_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_image_create_fn == + (decltype(::hsa_ext_image_create)*)hsa_ext_null && + "Duplicate load of extension 
import."); + table.hsa_ext_image_create_fn = (decltype(::hsa_ext_image_create)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_image_import_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_image_import_fn == + (decltype(::hsa_ext_image_import)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_image_import_fn = (decltype(::hsa_ext_image_import)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_image_export_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_image_export_fn == + (decltype(::hsa_ext_image_export)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_image_export_fn = (decltype(::hsa_ext_image_export)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_image_copy_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_image_copy_fn == + (decltype(::hsa_ext_image_copy)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_image_copy_fn = (decltype(::hsa_ext_image_copy)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_image_clear_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_image_clear_fn == + (decltype(::hsa_ext_image_clear)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_image_clear_fn = (decltype(::hsa_ext_image_clear)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_image_destroy_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_image_destroy_fn == + (decltype(::hsa_ext_image_destroy)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_image_destroy_fn = (decltype(::hsa_ext_image_destroy)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_sampler_create_impl"); + if (ptr != NULL) { + assert(table.hsa_ext_sampler_create_fn == + (decltype(::hsa_ext_sampler_create)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_sampler_create_fn = (decltype(::hsa_ext_sampler_create)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_ext_sampler_destroy_impl"); + if (ptr != NULL) { + 
assert(table.hsa_ext_sampler_destroy_fn == + (decltype(::hsa_ext_sampler_destroy)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_ext_sampler_destroy_fn = + (decltype(::hsa_ext_sampler_destroy)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_amd_image_get_info_max_dim_impl"); + if (ptr != NULL) { + assert(table.hsa_amd_image_get_info_max_dim_fn == + (decltype(::hsa_amd_image_get_info_max_dim)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_amd_image_get_info_max_dim_fn = + (decltype(::hsa_amd_image_get_info_max_dim)*)ptr; + } + + ptr = os::GetExportAddress(lib, "hsa_amd_image_create_impl"); + if (ptr != NULL) { + assert(table.hsa_amd_image_create_fn == + (decltype(::hsa_amd_image_create)*)hsa_ext_null && + "Duplicate load of extension import."); + table.hsa_amd_image_create_fn = + (decltype(::hsa_amd_image_create)*)ptr; + } + + core::hsa_internal_api_table_.extension_backup=table; + core::hsa_internal_api_table_.table.std_exts_=&core::hsa_internal_api_table_.extension_backup; + + ptr = os::GetExportAddress(lib, "Load"); + if (ptr != NULL) { + ((Load_t)ptr)(&core::hsa_internal_api_table_.table); + } + + return true; +} +} // namespace core + +//---------------------------------------------------------------------------// +// Exported extension stub functions +//---------------------------------------------------------------------------// + +hsa_status_t hsa_ext_program_create( + hsa_machine_model_t machine_model, hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char* options, hsa_ext_program_t* program) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_program_create_fn(machine_model, profile, + default_float_rounding_mode, options, program); +} + +hsa_status_t hsa_ext_program_destroy(hsa_ext_program_t program) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_program_destroy_fn(program); +} + +hsa_status_t 
hsa_ext_program_add_module(hsa_ext_program_t program, + hsa_ext_module_t module) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_program_add_module_fn(program, module); +} + +hsa_status_t hsa_ext_program_iterate_modules( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module, + void* data), + void* data) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_program_iterate_modules_fn(program, callback, data); +} + +hsa_status_t hsa_ext_program_get_info(hsa_ext_program_t program, + hsa_ext_program_info_t attribute, + void* value) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_program_get_info_fn(program, attribute, value); +} + +hsa_status_t hsa_ext_program_finalize( + hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention, + hsa_ext_control_directives_t control_directives, const char* options, + hsa_code_object_type_t code_object_type, hsa_code_object_t* code_object) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_program_finalize_fn(program, isa, call_convention, + control_directives, options, + code_object_type, code_object); +} + +hsa_status_t hsa_ext_image_get_capability( + hsa_agent_t agent, hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t* image_format, uint32_t* capability_mask) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_image_get_capability_fn(agent, geometry, image_format, + capability_mask); +} + +hsa_status_t hsa_ext_image_data_get_info( + hsa_agent_t agent, const hsa_ext_image_descriptor_t* image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t* image_data_info) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_image_data_get_info_fn(agent, image_descriptor, + access_permission, image_data_info); +} + +hsa_status_t hsa_ext_image_create( + hsa_agent_t agent, const 
hsa_ext_image_descriptor_t* image_descriptor, + const void* image_data, hsa_access_permission_t access_permission, + hsa_ext_image_t* image) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_image_create_fn(agent, image_descriptor, image_data, + access_permission, image); +} + +hsa_status_t hsa_ext_image_import(hsa_agent_t agent, const void* src_memory, + size_t src_row_pitch, size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t* image_region) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_image_import_fn(agent, src_memory, src_row_pitch, + src_slice_pitch, dst_image, image_region); +} + +hsa_status_t hsa_ext_image_export(hsa_agent_t agent, hsa_ext_image_t src_image, + void* dst_memory, size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t* image_region) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_image_export_fn(agent, src_image, dst_memory, dst_row_pitch, + dst_slice_pitch, image_region); +} + +hsa_status_t hsa_ext_image_copy(hsa_agent_t agent, hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_image_copy_fn(agent, src_image, src_offset, dst_image, + dst_offset, range); +} + +hsa_status_t hsa_ext_image_clear(hsa_agent_t agent, hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t* image_region) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_image_clear_fn(agent, image, data, image_region); +} + +hsa_status_t hsa_ext_image_destroy(hsa_agent_t agent, hsa_ext_image_t image) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_image_destroy_fn(agent, image); +} + +hsa_status_t hsa_ext_sampler_create( + hsa_agent_t agent, const hsa_ext_sampler_descriptor_t* sampler_descriptor, + 
hsa_ext_sampler_t* sampler) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_sampler_create_fn(agent, sampler_descriptor, sampler); +} + +hsa_status_t hsa_ext_sampler_destroy(hsa_agent_t agent, + hsa_ext_sampler_t sampler) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_ext_sampler_destroy_fn(agent, sampler); +} + +//---------------------------------------------------------------------------// +// Stubs for internal extension functions +//---------------------------------------------------------------------------// + +hsa_status_t hsa_amd_image_get_info_max_dim(hsa_agent_t component, + hsa_agent_info_t attribute, + void* value) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_amd_image_get_info_max_dim_fn(component, attribute, value); +} + +hsa_status_t hsa_amd_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const hsa_amd_image_descriptor_t *image_layout, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image) { + return core::Runtime::runtime_singleton_->extensions_.table + .hsa_amd_image_create_fn(agent, image_descriptor, image_layout, image_data, access_permission, image); +} diff --git a/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp b/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp new file mode 100644 index 0000000000..67c95867dd --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp @@ -0,0 +1,372 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/interrupt_signal.h" +#include "core/util/timer.h" + +namespace core { + +HsaEvent* InterruptSignal::CreateEvent(HSA_EVENTTYPE type, bool manual_reset) { + HsaEventDescriptor event_descriptor; + event_descriptor.EventType = type; + event_descriptor.SyncVar.SyncVar.UserData = NULL; + event_descriptor.SyncVar.SyncVarSize = sizeof(hsa_signal_value_t); + event_descriptor.NodeId = 0; + + HsaEvent* ret = NULL; + if (HSAKMT_STATUS_SUCCESS == + hsaKmtCreateEvent(&event_descriptor, manual_reset, false, &ret)) { + if (type == HSA_EVENTTYPE_MEMORY) { + memset(&ret->EventData.EventData.MemoryAccessFault.Failure, 0, + sizeof(HsaAccessAttributeFailure)); + } + } + + return ret; +} + +int InterruptSignal::rtti_id_ = 0; + +void InterruptSignal::DestroyEvent(HsaEvent* evt) { hsaKmtDestroyEvent(evt); } + +InterruptSignal::InterruptSignal(hsa_signal_value_t initial_value, + HsaEvent* use_event) + : Signal(initial_value) { + if (use_event != NULL) { + event_ = use_event; + free_event_ = false; + } else { + event_ = CreateEvent(HSA_EVENTTYPE_SIGNAL, false); + free_event_ = true; + } + + if (event_ != NULL) { + signal_.event_id = event_->EventId; + signal_.event_mailbox_ptr = event_->EventData.HWData2; + } else { + signal_.event_id = 0; + signal_.event_mailbox_ptr = 0; + } + signal_.kind = AMD_SIGNAL_KIND_USER; + + wait_on_event_ = true; +} + +InterruptSignal::~InterruptSignal() { + invalid_ = true; + SetEvent(); + while (InUse()) + ; + if (free_event_) hsaKmtDestroyEvent(event_); +} + +hsa_signal_value_t InterruptSignal::LoadRelaxed() { + return hsa_signal_value_t( + atomic::Load(&signal_.value, std::memory_order_relaxed)); +} + +hsa_signal_value_t InterruptSignal::LoadAcquire() { + return hsa_signal_value_t( + atomic::Load(&signal_.value, std::memory_order_acquire)); +} + +void InterruptSignal::StoreRelaxed(hsa_signal_value_t value) { + wait_on_event_ = true; + 
atomic::Store(&signal_.value, int64_t(value), std::memory_order_relaxed); + SetEvent(); +} + +void InterruptSignal::StoreRelease(hsa_signal_value_t value) { + wait_on_event_ = true; + atomic::Store(&signal_.value, int64_t(value), std::memory_order_release); + SetEvent(); +} + +hsa_signal_value_t InterruptSignal::WaitRelaxed( + hsa_signal_condition_t condition, hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) { + uint32_t prior = atomic::Increment(&waiting_); + + // assert(prior == 0 && "Multiple waiters on interrupt signal!"); + // Allow only the first waiter to sleep (temporary, known to be bad). + if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; + + MAKE_SCOPE_GUARD([&]() { atomic::Decrement(&waiting_); }); + + int64_t value; + + timer::fast_clock::time_point start_time = timer::fast_clock::now(); + + // Set a polling timeout value + // Exact time is not hugely important, it should just be a short while which + // is smaller than the thread scheduling quantum (usually around 16ms) + const timer::fast_clock::duration kMaxElapsed = std::chrono::milliseconds(5); + + uint64_t hsa_freq; + HSA::hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq); + const timer::fast_clock::duration fast_timeout = + timer::duration_from_seconds( + double(timeout) / double(hsa_freq)); + + bool condition_met = false; + while (true) { + if (invalid_) return 0; + + value = atomic::Load(&signal_.value, std::memory_order_relaxed); + + switch (condition) { + case HSA_SIGNAL_CONDITION_EQ: { + condition_met = (value == compare_value); + break; + } + case HSA_SIGNAL_CONDITION_NE: { + condition_met = (value != compare_value); + break; + } + case HSA_SIGNAL_CONDITION_GTE: { + condition_met = (value >= compare_value); + break; + } + case HSA_SIGNAL_CONDITION_LT: { + condition_met = (value < compare_value); + break; + } + default: + return 0; + } + if (condition_met) return hsa_signal_value_t(value); + + timer::fast_clock::time_point time = 
timer::fast_clock::now(); + if (time - start_time > kMaxElapsed) { + if (time - start_time > fast_timeout) { + value = atomic::Load(&signal_.value, std::memory_order_relaxed); + return hsa_signal_value_t(value); + } + if (wait_on_event_ && wait_hint != HSA_WAIT_STATE_ACTIVE) { + uint32_t wait_ms; + auto time_remaining = fast_timeout - (time - start_time); + if ((timeout == -1) || + (time_remaining > std::chrono::milliseconds(uint32_t(-1)))) + wait_ms = uint32_t(-1); + else + wait_ms = timer::duration_cast( + time_remaining).count(); + hsaKmtWaitOnEvent(event_, wait_ms); + } + } + } +} + +hsa_signal_value_t InterruptSignal::WaitAcquire( + hsa_signal_condition_t condition, hsa_signal_value_t compare_value, + uint64_t timeout, hsa_wait_state_t wait_hint) { + hsa_signal_value_t ret = + WaitRelaxed(condition, compare_value, timeout, wait_hint); + std::atomic_thread_fence(std::memory_order_acquire); + return ret; +} + +void InterruptSignal::AndRelaxed(hsa_signal_value_t value) { + atomic::And(&signal_.value, int64_t(value), std::memory_order_relaxed); + SetEvent(); +} + +void InterruptSignal::AndAcquire(hsa_signal_value_t value) { + atomic::And(&signal_.value, int64_t(value), std::memory_order_acquire); + SetEvent(); +} + +void InterruptSignal::AndRelease(hsa_signal_value_t value) { + atomic::And(&signal_.value, int64_t(value), std::memory_order_release); + SetEvent(); +} + +void InterruptSignal::AndAcqRel(hsa_signal_value_t value) { + atomic::And(&signal_.value, int64_t(value), std::memory_order_acq_rel); + SetEvent(); +} + +void InterruptSignal::OrRelaxed(hsa_signal_value_t value) { + atomic::Or(&signal_.value, int64_t(value), std::memory_order_relaxed); + SetEvent(); +} + +void InterruptSignal::OrAcquire(hsa_signal_value_t value) { + atomic::Or(&signal_.value, int64_t(value), std::memory_order_acquire); + SetEvent(); +} + +void InterruptSignal::OrRelease(hsa_signal_value_t value) { + atomic::Or(&signal_.value, int64_t(value), std::memory_order_release); + SetEvent(); 
+} + +void InterruptSignal::OrAcqRel(hsa_signal_value_t value) { + atomic::Or(&signal_.value, int64_t(value), std::memory_order_acq_rel); + SetEvent(); +} + +void InterruptSignal::XorRelaxed(hsa_signal_value_t value) { + atomic::Xor(&signal_.value, int64_t(value), std::memory_order_relaxed); + SetEvent(); +} + +void InterruptSignal::XorAcquire(hsa_signal_value_t value) { + atomic::Xor(&signal_.value, int64_t(value), std::memory_order_acquire); + SetEvent(); +} + +void InterruptSignal::XorRelease(hsa_signal_value_t value) { + atomic::Xor(&signal_.value, int64_t(value), std::memory_order_release); + SetEvent(); +} + +void InterruptSignal::XorAcqRel(hsa_signal_value_t value) { + atomic::Xor(&signal_.value, int64_t(value), std::memory_order_acq_rel); + SetEvent(); +} + +void InterruptSignal::AddRelaxed(hsa_signal_value_t value) { + atomic::Add(&signal_.value, int64_t(value), std::memory_order_relaxed); + SetEvent(); +} + +void InterruptSignal::AddAcquire(hsa_signal_value_t value) { + atomic::Add(&signal_.value, int64_t(value), std::memory_order_acquire); + SetEvent(); +} + +void InterruptSignal::AddRelease(hsa_signal_value_t value) { + atomic::Add(&signal_.value, int64_t(value), std::memory_order_release); + SetEvent(); +} + +void InterruptSignal::AddAcqRel(hsa_signal_value_t value) { + atomic::Add(&signal_.value, int64_t(value), std::memory_order_acq_rel); + SetEvent(); +} + +void InterruptSignal::SubRelaxed(hsa_signal_value_t value) { + atomic::Sub(&signal_.value, int64_t(value), std::memory_order_relaxed); + SetEvent(); +} + +void InterruptSignal::SubAcquire(hsa_signal_value_t value) { + atomic::Sub(&signal_.value, int64_t(value), std::memory_order_acquire); + SetEvent(); +} + +void InterruptSignal::SubRelease(hsa_signal_value_t value) { + atomic::Sub(&signal_.value, int64_t(value), std::memory_order_release); + SetEvent(); +} + +void InterruptSignal::SubAcqRel(hsa_signal_value_t value) { + atomic::Sub(&signal_.value, int64_t(value), std::memory_order_acq_rel); + 
SetEvent(); +} + +hsa_signal_value_t InterruptSignal::ExchRelaxed(hsa_signal_value_t value) { + hsa_signal_value_t ret = hsa_signal_value_t(atomic::Exchange( + &signal_.value, int64_t(value), std::memory_order_relaxed)); + SetEvent(); + return ret; +} + +hsa_signal_value_t InterruptSignal::ExchAcquire(hsa_signal_value_t value) { + hsa_signal_value_t ret = hsa_signal_value_t(atomic::Exchange( + &signal_.value, int64_t(value), std::memory_order_acquire)); + SetEvent(); + return ret; +} + +hsa_signal_value_t InterruptSignal::ExchRelease(hsa_signal_value_t value) { + hsa_signal_value_t ret = hsa_signal_value_t(atomic::Exchange( + &signal_.value, int64_t(value), std::memory_order_release)); + SetEvent(); + return ret; +} + +hsa_signal_value_t InterruptSignal::ExchAcqRel(hsa_signal_value_t value) { + hsa_signal_value_t ret = hsa_signal_value_t(atomic::Exchange( + &signal_.value, int64_t(value), std::memory_order_acq_rel)); + SetEvent(); + return ret; +} + +hsa_signal_value_t InterruptSignal::CasRelaxed(hsa_signal_value_t expected, + hsa_signal_value_t value) { + hsa_signal_value_t ret = hsa_signal_value_t( + atomic::Cas(&signal_.value, int64_t(value), int64_t(expected), + std::memory_order_relaxed)); + SetEvent(); + return ret; +} + +hsa_signal_value_t InterruptSignal::CasAcquire(hsa_signal_value_t expected, + hsa_signal_value_t value) { + hsa_signal_value_t ret = hsa_signal_value_t( + atomic::Cas(&signal_.value, int64_t(value), int64_t(expected), + std::memory_order_acquire)); + SetEvent(); + return ret; +} + +hsa_signal_value_t InterruptSignal::CasRelease(hsa_signal_value_t expected, + hsa_signal_value_t value) { + hsa_signal_value_t ret = hsa_signal_value_t( + atomic::Cas(&signal_.value, int64_t(value), int64_t(expected), + std::memory_order_release)); + SetEvent(); + return ret; +} + +hsa_signal_value_t InterruptSignal::CasAcqRel(hsa_signal_value_t expected, + hsa_signal_value_t value) { + hsa_signal_value_t ret = hsa_signal_value_t( + atomic::Cas(&signal_.value, 
int64_t(value), int64_t(expected), + std::memory_order_acq_rel)); + SetEvent(); + return ret; +} + +} // namespace core diff --git a/runtime/hsa-runtime/core/runtime/isa.cpp b/runtime/hsa-runtime/core/runtime/isa.cpp new file mode 100644 index 0000000000..86891aaef8 --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/isa.cpp @@ -0,0 +1,130 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/isa.h" + +#include +#include + +namespace core { + +const IsaRegistry::IsaMap IsaRegistry::supported_isas_ = + IsaRegistry::GetSupportedIsas(); + +const Isa *IsaRegistry::GetIsa(const std::string &full_name) { + auto isareg_iter = supported_isas_.find(full_name); + return isareg_iter == supported_isas_.end() ? nullptr : &isareg_iter->second; +} + +const Isa *IsaRegistry::GetIsa(const Isa::Version &version) { + auto isareg_iter = supported_isas_.find(Isa(version).GetFullName()); + return isareg_iter == supported_isas_.end() ? nullptr : &isareg_iter->second; +} + +const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() { +#define ISAREG_ENTRY_GEN(maj, min, stp) \ + Isa amd_amdgpu_##maj##min##stp; \ + amd_amdgpu_##maj##min##stp.version_ = Isa::Version(maj, min, stp); \ + supported_isas.insert( \ + std::make_pair( \ + amd_amdgpu_##maj##min##stp.GetFullName(), amd_amdgpu_##maj##min##stp)); \ + + IsaMap supported_isas; + + ISAREG_ENTRY_GEN(7, 0, 0) + ISAREG_ENTRY_GEN(7, 0, 1) + ISAREG_ENTRY_GEN(8, 0, 0) + ISAREG_ENTRY_GEN(8, 0, 1) + ISAREG_ENTRY_GEN(8, 0, 2) + ISAREG_ENTRY_GEN(8, 0, 3) + ISAREG_ENTRY_GEN(8, 1, 0) + ISAREG_ENTRY_GEN(9, 0, 0) + + return supported_isas; +} + +std::string Isa::GetFullName() const { + std::stringstream full_name; + full_name << GetVendor() << ":" << GetArchitecture() << ":" + << GetMajorVersion() << ":" << GetMinorVersion() << ":" + << GetStepping(); + return full_name.str(); +} + +bool Isa::GetInfo(const hsa_isa_info_t &attribute, void *value) const { + if (!value) { + return false; + } + + switch (attribute) { + case HSA_ISA_INFO_NAME_LENGTH: { + 
std::string full_name = GetFullName(); + *((uint32_t *)value) = static_cast(full_name.size()); + return true; + } + case HSA_ISA_INFO_NAME: { + std::string full_name = GetFullName(); + memcpy(value, full_name.c_str(), full_name.size()); + return true; + } + // @todo: following case needs to be removed + case HSA_ISA_INFO_CALL_CONVENTION_COUNT: { + *((uint32_t *)value) = 1; + return true; + } + // @todo: following case needs to be removed + case HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE: { + *((uint32_t *)value) = 64; + return true; + } + // @todo: following needs to be removed + case HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT: { + *((uint32_t *)value) = 40; + return true; + } + default: { + return false; + } + } +} + +} // namespace core diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp new file mode 100644 index 0000000000..8ee17b2911 --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -0,0 +1,1010 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/runtime.h" + +#include +#include +#include +#include +#include +#include + +#include "core/common/shared.h" + +#include "core/inc/hsa_ext_interface.h" +#include "core/inc/amd_cpu_agent.h" +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/amd_memory_region.h" +#include "core/inc/amd_topology.h" +#include "core/inc/signal.h" +#include "core/inc/interrupt_signal.h" + +#include "core/inc/hsa_api_trace_int.h" + +#define HSA_VERSION_MAJOR 1 +#define HSA_VERSION_MINOR 0 + +namespace core { +bool g_use_interrupt_wait = true; + +Runtime* Runtime::runtime_singleton_ = NULL; + +KernelMutex Runtime::bootstrap_lock_; + +static bool loaded = true; + +class RuntimeCleanup { + public: + ~RuntimeCleanup() { + if (!Runtime::IsOpen()) { + delete Runtime::runtime_singleton_; + } + + loaded = false; + } +}; + +static RuntimeCleanup cleanup_at_unload_; + +bool Runtime::Acquire() { + // Check to see if HSA has been cleaned up (process exit) + if (!loaded) 
return false; + + // Handle initialization races + ScopedAcquire boot(&bootstrap_lock_); + + if (runtime_singleton_ == NULL) { + runtime_singleton_ = new Runtime(); + } + + // Serialize with release + ScopedAcquire lock(&runtime_singleton_->kernel_lock_); + + if (runtime_singleton_->ref_count_ == INT32_MAX) { + return false; + } + + runtime_singleton_->ref_count_++; + + if (runtime_singleton_->ref_count_ == 1) { + runtime_singleton_->Load(); + } + + return true; +} + +bool Runtime::Release() { + ScopedAcquire lock(&kernel_lock_); + if (ref_count_ == 0) { + return false; + } + + if (ref_count_ == 1) { + // Release all registered memory, then unload backends + Unload(); + } + + ref_count_--; + + return true; +} + +bool Runtime::IsOpen() { + return (Runtime::runtime_singleton_ != NULL) && + (Runtime::runtime_singleton_->ref_count_ != 0); +} + +void Runtime::RegisterAgent(Agent* agent) { + if (agent->device_type() == Agent::DeviceType::kAmdCpuDevice) { + cpu_agents_.push_back(agent); + + // Add cpu regions to the system region list. + for (const core::MemoryRegion* region : agent->regions()) { + if (region->fine_grain()) { + system_regions_fine_.push_back(region); + } else { + system_regions_coarse_.push_back(region); + } + } + + assert(system_regions_fine_.size() > 0); + + // Init default fine grain system region allocator using fine grain + // system region of the first discovered CPU agent. + if (cpu_agents_.size() == 1) { + if (system_regions_fine_[0]->full_profile()) { + system_allocator_ = [](size_t size, size_t alignment) -> void * { + return _aligned_malloc(size, alignment); + }; + + system_deallocator_ = [](void* ptr) { _aligned_free(ptr); }; + } else { + // Might need memory pooling to cover allocation that + // requires less than 4096 bytes. 
+ system_allocator_ = [&](size_t size, size_t alignment) -> void * { + assert(alignment <= 4096); + void* ptr = NULL; + return (HSA_STATUS_SUCCESS == + core::Runtime::runtime_singleton_->AllocateMemory( + system_regions_fine_[0], size, &ptr)) + ? ptr + : NULL; + }; + + system_deallocator_ = [](void* ptr) { + core::Runtime::runtime_singleton_->FreeMemory(ptr); + }; + } + + BaseShared::SetAllocateAndFree(system_allocator_, system_deallocator_); + } + + // Setup system clock frequency for the first time. + if (sys_clock_freq_ == 0) { + // Cache system clock frequency + HsaClockCounters clocks; + hsaKmtGetClockCounters(0, &clocks); + sys_clock_freq_ = clocks.SystemClockFrequencyHz; + host_agent_ = agent; + } + } else if (agent->device_type() == Agent::DeviceType::kAmdGpuDevice) { + gpu_agents_.push_back(agent); + + gpu_ids_.push_back(agent->node_id()); + + // Assign the first discovered gpu agent as blit agent that will provide + // DMA operation for hsa_memory_copy. + if (blit_agent_ == NULL) { + blit_agent_ = agent; + + // Query the start and end address of the SVM address space in this + // platform. + if (reinterpret_cast(blit_agent_)->profile() == + HSA_PROFILE_BASE) { + std::vector::const_iterator it = + std::find_if(blit_agent_->regions().begin(), + blit_agent_->regions().end(), + [](const core::MemoryRegion* region) { + return ( + reinterpret_cast(region)->IsSvm()); + }); + + assert(it != blit_agent_->regions().end()); + + const amd::MemoryRegion* svm_region = + reinterpret_cast(*it); + + start_svm_address_ = + static_cast(svm_region->GetBaseAddress()); + end_svm_address_ = start_svm_address_ + svm_region->GetPhysicalSize(); + + // Bind VM fault handler when we detect the first GPU agent. + // TODO(bwicakso): validate if it works on APU. 
+        BindVmFaultHandler();
+      } else {
+        start_svm_address_ = 0;
+        end_svm_address_ = os::GetUserModeVirtualMemoryBase() +
+                           os::GetUserModeVirtualMemorySize();
+      }
+    }
+  }
+}
+
+// Applies DeleteObject to every registered GPU and CPU agent, then empties
+// the agent lists, the GPU id list and the cached system region lists, and
+// clears the blit agent pointer.
+void Runtime::DestroyAgents() {
+  std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject());
+  gpu_agents_.clear();
+
+  gpu_ids_.clear();
+
+  std::for_each(cpu_agents_.begin(), cpu_agents_.end(), DeleteObject());
+  cpu_agents_.clear();
+
+  blit_agent_ = NULL;
+
+  system_regions_fine_.clear();
+  system_regions_coarse_.clear();
+}
+
+// Sizes the link matrix to GetIndexLinkInfo(0, num_link) entries and zeroes
+// every entry.
+void Runtime::SetLinkCount(size_t num_link) {
+  const size_t last_index = GetIndexLinkInfo(0, num_link);
+  link_matrix_.resize(last_index);
+
+  // GetIndexLinkInfo(0, num_link) is 0 for num_link of 0 or 1, which leaves
+  // the matrix empty; indexing element 0 of an empty vector is undefined, so
+  // only clear when there is something to clear.
+  if (!link_matrix_.empty()) {
+    // Clear whole elements. Each entry carries num_hop in addition to the
+    // hsa_amd_memory_pool_link_info_t payload, so the byte count must be
+    // based on the element size, not on the payload size.
+    memset(&link_matrix_[0], 0,
+           link_matrix_.size() * sizeof(link_matrix_[0]));
+  }
+}
+
+// Stores the hop count and link description for the node pair
+// (node_id_from, node_id_to) at the slot computed by GetIndexLinkInfo.
+// The computed index is not range-checked here.
+void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
+                               uint32_t num_hop,
+                               hsa_amd_memory_pool_link_info_t& link_info) {
+  const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
+  link_matrix_[idx].num_hop = num_hop;
+  link_matrix_[idx].info = link_info;
+}
+
+// Returns a reference to the link matrix entry for the given node pair.
+// The computed index is not range-checked here.
+const Runtime::LinkInfo& Runtime::GetLinkInfo(uint32_t node_id_from,
+                                              uint32_t node_id_to) {
+  return link_matrix_[GetIndexLinkInfo(node_id_from, node_id_to)];
+}
+
+// Maps a pair of node ids to a slot in the flat link matrix. The pair is
+// treated as unordered (only the larger and smaller id matter):
+//   index = (max - 1) * max / 2 + min
+// NOTE(review): when both ids are 0 the unsigned "- 1" wraps around;
+// presumably callers never pass two equal ids - confirm.
+uint32_t Runtime::GetIndexLinkInfo(uint32_t node_id_from, uint32_t node_id_to) {
+  const uint32_t node_id_max = std::max(node_id_from, node_id_to) - 1;
+  const uint32_t node_id_min = std::min(node_id_from, node_id_to);
+  return ((node_id_max * (node_id_max + 1) / 2) + node_id_min);
+}
+
+hsa_status_t Runtime::IterateAgent(hsa_status_t (*callback)(hsa_agent_t agent,
+                                                            void* data),
+                                   void* data) {
+  if (!IsOpen()) {
+    return HSA_STATUS_ERROR_NOT_INITIALIZED;
+  }
+
+  std::vector* agent_lists[2] = {&cpu_agents_, &gpu_agents_};
+  for (std::vector* agent_list : agent_lists) {
+    for (size_t i = 0; i < agent_list->size(); ++i) {
+      hsa_agent_t agent = Agent::Convert(agent_list->at(i));
+      hsa_status_t status = callback(agent, data);
+
+      if (status != HSA_STATUS_SUCCESS) {
+
return status; + } + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size, + void** ptr) { + return AllocateMemory(false, region, size, ptr); +} + +hsa_status_t Runtime::AllocateMemory(bool restrict_access, + const MemoryRegion* region, size_t size, + void** address) { + const amd::MemoryRegion* amd_region = + reinterpret_cast(region); + hsa_status_t status = amd_region->Allocate(restrict_access, size, address); + + // Track the allocation result so that it could be freed properly. + if (status == HSA_STATUS_SUCCESS) { + ScopedAcquire lock(&memory_lock_); + allocation_map_[*address] = AllocationRegion(region, size); + } + + return status; +} + +hsa_status_t Runtime::FreeMemory(void* ptr) { + if (ptr == NULL) { + return HSA_STATUS_SUCCESS; + } + + const MemoryRegion* region = NULL; + size_t size = 0; + { + ScopedAcquire lock(&memory_lock_); + + std::map::const_iterator it = + allocation_map_.find(ptr); + + if (it == allocation_map_.end()) { + assert(false && "Can't find address in allocation map"); + return HSA_STATUS_ERROR; + } + + region = it->second.region; + size = it->second.size; + + allocation_map_.erase(it); + } + + return region->Free(ptr, size); +} + +hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { + assert(dst != NULL && src != NULL && size != 0); + + bool is_src_system = false; + bool is_dst_system = false; + const uintptr_t src_uptr = reinterpret_cast(src); + const uintptr_t dst_uptr = reinterpret_cast(dst); + + if ((reinterpret_cast(blit_agent_)->profile() == + HSA_PROFILE_FULL)) { + is_src_system = (src_uptr < end_svm_address_); + is_dst_system = (dst_uptr < end_svm_address_); + } else { + is_src_system = + ((src_uptr < start_svm_address_) || (src_uptr >= end_svm_address_)); + is_dst_system = + ((dst_uptr < start_svm_address_) || (dst_uptr >= end_svm_address_)); + + if ((is_src_system && !is_dst_system) || + (!is_src_system && is_dst_system)) { + // Use 
staging buffer or pin if either src or dst is gpuvm and the other + // is system memory allocated via OS or C/C++ allocator. + return CopyMemoryHostAlloc(dst, src, size, is_dst_system); + } + } + + if (is_src_system && is_dst_system) { + memmove(dst, src, size); + return HSA_STATUS_SUCCESS; + } + + return blit_agent_->DmaCopy(dst, src, size); +} + +hsa_status_t Runtime::CopyMemoryHostAlloc(void* dst, const void* src, + size_t size, bool dst_malloc) { + void* usrptr = (dst_malloc) ? dst : const_cast(src); + void* agent_ptr = NULL; + + hsa_agent_t blit_agent = core::Agent::Convert(blit_agent_); + + const amd::MemoryRegion* system_region = + reinterpret_cast(system_regions_fine_[0]); + hsa_status_t stat = + system_region->Lock(1, &blit_agent, usrptr, size, &agent_ptr); + + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + + stat = blit_agent_->DmaCopy((dst_malloc) ? agent_ptr : dst, + (dst_malloc) ? src : agent_ptr, size); + + system_region->Unlock(usrptr); + + return stat; +} + +hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent, + const void* src, core::Agent& src_agent, + size_t size, + std::vector& dep_signals, + core::Signal& completion_signal) { + const bool dst_gpu = + (dst_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice); + const bool src_gpu = + (src_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice); + if (dst_gpu || src_gpu) { + core::Agent& copy_agent = (src_gpu) ? src_agent : dst_agent; + return copy_agent.DmaCopy(dst, src, size, dep_signals, completion_signal); + } + + // For cpu to cpu, fire and forget a copy thread. 
+ std::thread([](void* dst, const void* src, size_t size, + std::vector dep_signals, + core::Signal* completion_signal) { + for (core::Signal* dep : dep_signals) { + dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED); + } + + memcpy(dst, src, size); + + completion_signal->SubRelease(1); + }, + dst, src, size, dep_signals, &completion_signal).detach(); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::FillMemory(void* ptr, uint32_t value, size_t count) { + assert(blit_agent_ != NULL); + return blit_agent_->DmaFill(ptr, value, count); +} + +hsa_status_t Runtime::AllowAccess(uint32_t num_agents, + const hsa_agent_t* agents, const void* ptr) { + const amd::MemoryRegion* amd_region = NULL; + size_t alloc_size = 0; + + { + ScopedAcquire lock(&memory_lock_); + + std::map::const_iterator it = + allocation_map_.find(ptr); + + if (it == allocation_map_.end()) { + return HSA_STATUS_ERROR; + } + + amd_region = reinterpret_cast(it->second.region); + alloc_size = it->second.size; + } + + return amd_region->AllowAccess(num_agents, agents, ptr, alloc_size); +} + +hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { + switch (attribute) { + case HSA_SYSTEM_INFO_VERSION_MAJOR: + *((uint16_t*)value) = HSA_VERSION_MAJOR; + break; + case HSA_SYSTEM_INFO_VERSION_MINOR: + *((uint16_t*)value) = HSA_VERSION_MINOR; + break; + case HSA_SYSTEM_INFO_TIMESTAMP: { + HsaClockCounters clocks; + hsaKmtGetClockCounters(0, &clocks); + *((uint64_t*)value) = clocks.SystemClockCounter; + break; + } + case HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY: { + assert(sys_clock_freq_ != 0 && + "Use of HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY before HSA " + "initialization completes."); + *(uint64_t*)value = sys_clock_freq_; + break; + } + case HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT: + *((uint64_t*)value) = 0xFFFFFFFFFFFFFFFF; + break; + case HSA_SYSTEM_INFO_ENDIANNESS: +#if defined(HSA_LITTLE_ENDIAN) + *((hsa_endianness_t*)value) = HSA_ENDIANNESS_LITTLE; +#else + 
*((hsa_endianness_t*)value) = HSA_ENDIANNESS_BIG; +#endif + break; + case HSA_SYSTEM_INFO_MACHINE_MODEL: +#if defined(HSA_LARGE_MODEL) + *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_LARGE; +#else + *((hsa_machine_model_t*)value) = HSA_MACHINE_MODEL_SMALL; +#endif + break; + case HSA_SYSTEM_INFO_EXTENSIONS: + memset(value, 0, sizeof(uint8_t) * 128); + + if (extensions_.table.hsa_ext_program_finalize_fn != NULL) { + *((uint8_t*)value) = 1 << HSA_EXTENSION_FINALIZER; + } + + if (extensions_.table.hsa_ext_image_create_fn != NULL) { + *((uint8_t*)value) |= 1 << HSA_EXTENSION_IMAGES; + } + + *((uint8_t*)value) |= 1 << HSA_EXTENSION_AMD_PROFILER; + + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; +} + +uint32_t Runtime::GetQueueId() { return atomic::Increment(&queue_count_); } + +hsa_status_t Runtime::SetAsyncSignalHandler(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, + void* arg) { + // Asyncronous signal handler is only supported when KFD events are on. + if (!core::g_use_interrupt_wait) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + // Indicate that this signal is in use. 
+ if (signal.handle != 0) hsa_signal_handle(signal)->Retain(); + + ScopedAcquire scope_lock(&async_events_control_.lock); + + // Lazy initializer + if (async_events_control_.async_events_thread_ == NULL) { + // Create monitoring thread control signal + auto err = HSA::hsa_signal_create(0, 0, NULL, &async_events_control_.wake); + if (err != HSA_STATUS_SUCCESS) { + assert(false && "Asyncronous events control signal creation error."); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + async_events_.PushBack(async_events_control_.wake, HSA_SIGNAL_CONDITION_NE, + 0, NULL, NULL); + + // Start event monitoring thread + async_events_control_.exit = false; + async_events_control_.async_events_thread_ = + os::CreateThread(AsyncEventsLoop, NULL); + if (async_events_control_.async_events_thread_ == NULL) { + assert(false && "Asyncronous events thread creation error."); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + } + + new_async_events_.PushBack(signal, cond, value, handler, arg); + + hsa_signal_handle(async_events_control_.wake)->StoreRelease(1); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::InteropMap(uint32_t num_agents, Agent** agents, + int interop_handle, uint32_t flags, + size_t* size, void** ptr, + size_t* metadata_size, const void** metadata) { + HsaGraphicsResourceInfo info; + + HSAuint32 short_nodes[64]; + HSAuint32* nodes = short_nodes; + if (num_agents > 64) { + nodes = new HSAuint32[num_agents]; + if (nodes == NULL) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + MAKE_SCOPE_GUARD([&]() { + if (num_agents > 64) delete[] nodes; + }); + + for (int i = 0; i < num_agents; i++) + agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, + &nodes[i]); + + if (hsaKmtRegisterGraphicsHandleToNodes(interop_handle, &info, num_agents, + nodes) != HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + HSAuint64 altAddress; + HsaMemMapFlags map_flags; + map_flags.Value = 0; + map_flags.ui32.PageSize = HSA_PAGE_SIZE_64KB; + if 
(hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes, + &altAddress, map_flags, num_agents, + nodes) != HSAKMT_STATUS_SUCCESS) { + map_flags.ui32.PageSize = HSA_PAGE_SIZE_4KB; + if (hsaKmtMapMemoryToGPUNodes(info.MemoryAddress, info.SizeInBytes, + &altAddress, map_flags, num_agents, + nodes) != HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + if (metadata_size != NULL) *metadata_size = info.MetadataSizeInBytes; + if (metadata != NULL) *metadata = info.Metadata; + + *size = info.SizeInBytes; + *ptr = info.MemoryAddress; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t Runtime::InteropUnmap(void* ptr) +{ + if(hsaKmtUnmapMemoryToGPU(ptr)!=HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if(hsaKmtDeregisterMemory(ptr)!=HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + return HSA_STATUS_SUCCESS; +} + +void Runtime::AsyncEventsLoop(void*) { + auto& async_events_control_ = runtime_singleton_->async_events_control_; + auto& async_events_ = runtime_singleton_->async_events_; + auto& new_async_events_ = runtime_singleton_->new_async_events_; + + while (!async_events_control_.exit) { + // Wait for a signal + hsa_signal_value_t value; + uint32_t index = hsa_amd_signal_wait_any( + uint32_t(async_events_.Size()), &async_events_.signal_[0], + &async_events_.cond_[0], &async_events_.value_[0], uint64_t(-1), + HSA_WAIT_STATE_BLOCKED, &value); + + // Reset the control signal + if (index == 0) { + hsa_signal_handle(async_events_control_.wake)->StoreRelaxed(0); + } else if (index != -1) { + // No error or timout occured, process the handler + assert(async_events_.handler_[index] != NULL); + bool keep = + async_events_.handler_[index](value, async_events_.arg_[index]); + if (!keep) { + hsa_signal_handle(async_events_.signal_[index])->Release(); + async_events_.CopyIndex(index, async_events_.Size() - 1); + async_events_.PopBack(); + } + } + + // Check for dead signals + index = 0; + while (index != 
async_events_.Size()) { + if (!hsa_signal_handle(async_events_.signal_[index])->IsValid()) { + hsa_signal_handle(async_events_.signal_[index])->Release(); + async_events_.CopyIndex(index, async_events_.Size() - 1); + async_events_.PopBack(); + continue; + } + index++; + } + + // Insert new signals and find plain functions + typedef std::pair func_arg_t; + std::vector functions; + { + ScopedAcquire scope_lock(&async_events_control_.lock); + for (size_t i = 0; i < new_async_events_.Size(); i++) { + if (new_async_events_.signal_[i].handle == 0) { + functions.push_back( + func_arg_t((void (*)(void*))new_async_events_.handler_[i], + new_async_events_.arg_[i])); + continue; + } + async_events_.PushBack( + new_async_events_.signal_[i], new_async_events_.cond_[i], + new_async_events_.value_[i], new_async_events_.handler_[i], + new_async_events_.arg_[i]); + } + new_async_events_.Clear(); + } + + // Call plain functions + for (size_t i = 0; i < functions.size(); i++) + functions[i].first(functions[i].second); + functions.clear(); + } + + // Release wait count of all pending signals + for (size_t i = 1; i < async_events_.Size(); i++) + hsa_signal_handle(async_events_.signal_[i])->Release(); + async_events_.Clear(); + + for (size_t i = 0; i < new_async_events_.Size(); i++) + hsa_signal_handle(new_async_events_.signal_[i])->Release(); + new_async_events_.Clear(); +} + +void Runtime::BindVmFaultHandler() { + if (core::g_use_interrupt_wait) { + // Create memory event with manual reset to avoid racing condition + // with driver in case of multiple concurrent VM faults. + vm_fault_event_ = + core::InterruptSignal::CreateEvent(HSA_EVENTTYPE_MEMORY, true); + + // Create an interrupt signal object to contain the memory event. + // This signal object will be registered with the async handler global + // thread. 
+ vm_fault_signal_ = new core::InterruptSignal(0, vm_fault_event_); + + if (!vm_fault_signal_->IsValid() || vm_fault_signal_->EopEvent() == NULL) { + assert(false && "Failed on creating VM fault signal"); + return; + } + + SetAsyncSignalHandler(core::Signal::Convert(vm_fault_signal_), + HSA_SIGNAL_CONDITION_NE, 0, VMFaultHandler, + reinterpret_cast(vm_fault_signal_)); + } +} + +bool Runtime::VMFaultHandler(hsa_signal_value_t val, void* arg) { + core::InterruptSignal* vm_fault_signal = + reinterpret_cast(arg); + + assert(vm_fault_signal != NULL); + + if (vm_fault_signal == NULL) { + return false; + } + + std::string print_vm_message = os::GetEnvVar("HSA_ENABLE_VM_FAULT_MESSAGE"); + if (print_vm_message == "1") { + HsaEvent* vm_fault_event = vm_fault_signal->EopEvent(); + + const HsaMemoryAccessFault& fault = + vm_fault_event->EventData.EventData.MemoryAccessFault; + + std::string reason = ""; + if (fault.Failure.NotPresent == 1) { + reason += "Page not present or supervisor privilege"; + } else if (fault.Failure.ReadOnly == 1) { + reason += "Write access to a read-only page"; + } else if (fault.Failure.NoExecute == 1) { + reason += "Execute access to a page marked NX"; + } else if (fault.Failure.GpuAccess == 1) { + reason += "Host access only"; + } else if (fault.Failure.ECC == 1) { + reason += "ECC failure (if supported by HW)"; + } + + fprintf(stderr, + "Memory access fault by GPU node-%u on address %p%s. Reason: %s.\n", + fault.NodeId, reinterpret_cast(fault.VirtualAddress), + (fault.Failure.Imprecise == 1) ? "(may not be exact address)" : "", + reason.c_str()); + } else { + assert(false && "GPU memory access fault."); + } + + std::abort(); + + // No need to keep the signal because we are done. 
+ return false; +} + +Runtime::Runtime() + : host_agent_(NULL), + blit_agent_(NULL), + queue_count_(0), + sys_clock_freq_(0), + vm_fault_event_(NULL), + vm_fault_signal_(NULL), + ref_count_(0) { + start_svm_address_ = 0; +#if defined(HSA_LARGE_MODEL) + end_svm_address_ = UINT64_MAX; +#else + end_svm_address_ = UINT32_MAX; +#endif +} + +void Runtime::Load() { + // Load interrupt enable option + std::string interrupt = os::GetEnvVar("HSA_ENABLE_INTERRUPT"); + g_use_interrupt_wait = (interrupt != "0"); + + if (!amd::Load()) { + return; + } + + loader_ = amd::hsa::loader::Loader::Create(&loader_context_); + + // Load extensions + LoadExtensions(); + + // Load tools libraries + LoadTools(); +} + +void Runtime::Unload() { + UnloadTools(); + UnloadExtensions(); + + amd::hsa::loader::Loader::Destroy(loader_); + loader_ = nullptr; + + async_events_control_.Shutdown(); + + delete vm_fault_signal_; + core::InterruptSignal::DestroyEvent(vm_fault_event_); + + DestroyAgents(); + + CloseTools(); + + amd::Unload(); +} + +void Runtime::LoadExtensions() { +// Load finalizer and extension library +#ifdef HSA_LARGE_MODEL + static const std::string kFinalizerLib[] = {"hsa-ext-finalize64.dll", + "libhsa-ext-finalize64.so.1"}; + static const std::string kImageLib[] = {"hsa-ext-image64.dll", + "libhsa-ext-image64.so.1"}; +#else + static const std::string kFinalizerLib[] = {"hsa-ext-finalize.dll", + "libhsa-ext-finalize.so.1"}; + static const std::string kImageLib[] = {"hsa-ext-image.dll", + "libhsa-ext-image.so.1"}; +#endif + extensions_.Load(kFinalizerLib[os_index(os::current_os)]); + extensions_.Load(kImageLib[os_index(os::current_os)]); +} + +void Runtime::UnloadExtensions() { extensions_.Unload(); } + +static std::vector parse_tool_names(std::string tool_names) { + std::vector names; + std::string name = ""; + bool quoted = false; + while (tool_names.size() != 0) { + auto index = tool_names.find_first_of(" \"\\"); + if (index == std::string::npos) { + name += tool_names; + break; + } 
+ switch (tool_names[index]) { + case ' ': { + if (!quoted) { + name += tool_names.substr(0, index); + tool_names.erase(0, index + 1); + names.push_back(name); + name = ""; + } else { + name += tool_names.substr(0, index + 1); + tool_names.erase(0, index + 1); + } + break; + } + case '\"': { + if (quoted) { + quoted = false; + name += tool_names.substr(0, index); + tool_names.erase(0, index + 1); + names.push_back(name); + name = ""; + } else { + quoted = true; + tool_names.erase(0, index + 1); + } + break; + } + case '\\': { + if (tool_names.size() > index + 1) { + name += tool_names.substr(0, index) + tool_names[index + 1]; + tool_names.erase(0, index + 2); + } + break; + } + } // end switch + } // end while + + if (name != "") names.push_back(name); + return names; +} + +void Runtime::LoadTools() { + typedef bool (*tool_init_t)(::ApiTable*, uint64_t, uint64_t, + const char* const*); + typedef Agent* (*tool_wrap_t)(Agent*); + typedef void (*tool_add_t)(Runtime*); + + // Link extensions to API interception + hsa_api_table_.LinkExts(&extensions_.table); + + // Load tool libs + std::string tool_names = os::GetEnvVar("HSA_TOOLS_LIB"); + if (tool_names != "") { + std::vector names = parse_tool_names(tool_names); + std::vector failed; + for (int i = 0; i < names.size(); i++) { + os::LibHandle tool = os::LoadLib(names[i]); + + if (tool != NULL) { + tool_libs_.push_back(tool); + + tool_init_t ld; + ld = (tool_init_t)os::GetExportAddress(tool, "OnLoad"); + if (ld) { + if (!ld(&hsa_api_table_.table, 0, failed.size(), &failed[0])) { + failed.push_back(names[i].c_str()); + os::CloseLib(tool); + continue; + } + } + + tool_wrap_t wrap; + wrap = (tool_wrap_t)os::GetExportAddress(tool, "WrapAgent"); + if (wrap) { + std::vector* agent_lists[2] = {&cpu_agents_, + &gpu_agents_}; + for (std::vector* agent_list : agent_lists) { + for (size_t agent_idx = 0; agent_idx < agent_list->size(); + ++agent_idx) { + Agent* agent = wrap(agent_list->at(agent_idx)); + if (agent != NULL) { + 
assert(agent->IsValid() && + "Agent returned from WrapAgent is not valid"); + agent_list->at(agent_idx) = agent; + } + } + } + } + + tool_add_t add; + add = (tool_add_t)os::GetExportAddress(tool, "AddAgent"); + if (add) add(this); + } + } + } +} + +void Runtime::UnloadTools() { + typedef void (*tool_unload_t)(); + for (size_t i = tool_libs_.size(); i != 0; i--) { + tool_unload_t unld; + unld = (tool_unload_t)os::GetExportAddress(tool_libs_[i - 1], "OnUnload"); + if (unld) unld(); + } + + // Reset API table in case some tool doesn't cleanup properly + hsa_api_table_.Reset(); +} + +void Runtime::CloseTools() { + // Due to valgrind bug, runtime cannot dlclose extensions see: + // http://valgrind.org/docs/manual/faq.html#faq.unhelpful + if (os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND") != "1") { + for (int i = 0; i < tool_libs_.size(); i++) os::CloseLib(tool_libs_[i]); + } + tool_libs_.clear(); +} + +void Runtime::AsyncEventsControl::Shutdown() { + if (async_events_thread_ != NULL) { + exit = true; + hsa_signal_handle(wake)->StoreRelaxed(1); + os::WaitForThread(async_events_thread_); + os::CloseThread(async_events_thread_); + async_events_thread_ = NULL; + HSA::hsa_signal_destroy(wake); + } +} + +void Runtime::AsyncEvents::PushBack(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg) { + signal_.push_back(signal); + cond_.push_back(cond); + value_.push_back(value); + handler_.push_back(handler); + arg_.push_back(arg); +} + +void Runtime::AsyncEvents::CopyIndex(size_t dst, size_t src) { + signal_[dst] = signal_[src]; + cond_[dst] = cond_[src]; + value_[dst] = value_[src]; + handler_[dst] = handler_[src]; + arg_[dst] = arg_[src]; +} + +size_t Runtime::AsyncEvents::Size() { return signal_.size(); } + +void Runtime::AsyncEvents::PopBack() { + signal_.pop_back(); + cond_.pop_back(); + value_.pop_back(); + handler_.pop_back(); + arg_.pop_back(); +} + +void Runtime::AsyncEvents::Clear() { + signal_.clear(); 
+ cond_.clear(); + value_.clear(); + handler_.clear(); + arg_.clear(); +} + +} // namespace core diff --git a/runtime/hsa-runtime/core/runtime/signal.cpp b/runtime/hsa-runtime/core/runtime/signal.cpp new file mode 100644 index 0000000000..0b417b60cf --- /dev/null +++ b/runtime/hsa-runtime/core/runtime/signal.cpp @@ -0,0 +1,187 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTME_CORE_SIGNAL_CPP_ +#define HSA_RUNTME_CORE_SIGNAL_CPP_ + +#include "core/inc/signal.h" +#include "core/util/timer.h" +#include + +namespace core { + +uint32_t Signal::WaitAny(uint32_t signal_count, hsa_signal_t* hsa_signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value) { + hsa_signal_handle* signals = + reinterpret_cast(hsa_signals); + uint32_t prior = 0; + for (uint32_t i = 0; i < signal_count; i++) + prior = Max(prior, atomic::Increment(&signals[i]->waiting_)); + + MAKE_SCOPE_GUARD([&]() { + for (uint32_t i = 0; i < signal_count; i++) + atomic::Decrement(&signals[i]->waiting_); + }); + + // Allow only the first waiter to sleep (temporary, known to be bad). + if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; + + // Ensure that all signals in the list can be slept on. 
+ if (wait_hint != HSA_WAIT_STATE_ACTIVE) { + for (uint32_t i = 0; i < signal_count; i++) { + if (signals[i]->EopEvent() == NULL) { + wait_hint = HSA_WAIT_STATE_ACTIVE; + break; + } + } + } + + const uint32_t small_size = 10; + HsaEvent* short_evts[small_size]; + HsaEvent** evts = NULL; + uint32_t unique_evts = 0; + if (wait_hint != HSA_WAIT_STATE_ACTIVE) { + if (signal_count > small_size) + evts = new HsaEvent* [signal_count]; + else + evts = short_evts; + for (uint32_t i = 0; i < signal_count; i++) + evts[i] = signals[i]->EopEvent(); + std::sort(evts, evts + signal_count); + HsaEvent** end = std::unique(evts, evts + signal_count); + unique_evts = uint32_t(end - evts); + } + MAKE_SCOPE_GUARD([&]() { + if (signal_count > small_size) delete[] evts; + }); + + int64_t value; + + timer::fast_clock::time_point start_time = timer::fast_clock::now(); + + // Set a polling timeout value + // Exact time is not hugely important, it should just be a short while which + // is smaller than the thread scheduling quantum (usually around 16ms) + const timer::fast_clock::duration kMaxElapsed = std::chrono::milliseconds(5); + + // Convert timeout value into the fast_clock domain + uint64_t hsa_freq; + HSA::hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq); + const timer::fast_clock::duration fast_timeout = + timer::duration_from_seconds( + double(timeout) / double(hsa_freq)); + + bool condition_met = false; + while (true) { + for (uint32_t i = 0; i < signal_count; i++) { + if (signals[i]->invalid_) return uint32_t(-1); + + // Handling special event. 
+ if (signals[i]->EopEvent() != NULL) { + const HSA_EVENTTYPE event_type = + signals[i]->EopEvent()->EventData.EventType; + if (event_type == HSA_EVENTTYPE_MEMORY) { + const HsaMemoryAccessFault& fault = + signals[i]->EopEvent()->EventData.EventData.MemoryAccessFault; + const uint32_t* failure = + reinterpret_cast(&fault.Failure); + if (*failure != 0) { + return i; + } + } + } + + value = + atomic::Load(&signals[i]->signal_.value, std::memory_order_relaxed); + + switch (conds[i]) { + case HSA_SIGNAL_CONDITION_EQ: { + condition_met = (value == values[i]); + break; + } + case HSA_SIGNAL_CONDITION_NE: { + condition_met = (value != values[i]); + break; + } + case HSA_SIGNAL_CONDITION_GTE: { + condition_met = (value >= values[i]); + break; + } + case HSA_SIGNAL_CONDITION_LT: { + condition_met = (value < values[i]); + break; + } + default: + return uint32_t(-1); + } + if (condition_met) { + if (satisfying_value != NULL) *satisfying_value = value; + return i; + } + } + + timer::fast_clock::time_point time = timer::fast_clock::now(); + if (time - start_time > kMaxElapsed) { + if (time - start_time > fast_timeout) { + return uint32_t(-1); + } + if (wait_hint != HSA_WAIT_STATE_ACTIVE) { + uint32_t wait_ms; + auto time_remaining = fast_timeout - (time - start_time); + if ((timeout == -1) || + (time_remaining > std::chrono::milliseconds(uint32_t(-1)))) + wait_ms = uint32_t(-1); + else + wait_ms = timer::duration_cast( + time_remaining).count(); + hsaKmtWaitOnMultipleEvents(evts, unique_evts, false, wait_ms); + } + } + } +} + +} // namespace core + +#endif // header guard diff --git a/runtime/hsa-runtime/core/util/atomic_helpers.h b/runtime/hsa-runtime/core/util/atomic_helpers.h new file mode 100644 index 0000000000..1675c19da8 --- /dev/null +++ b/runtime/hsa-runtime/core/util/atomic_helpers.h @@ -0,0 +1,405 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// 
Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Helpers to use non-atomic types with C++11 atomic operations. 
+ +#ifndef HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ +#define HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ + +#include +#include "utils.h" + +/// @brief: Special assert used here to check each atomic variable for lock free +/// implementation. +/// ANY locked atomics are very likely incompatable with out-of-library +/// concurrent access (HW access for instance) +#define lockless_check(exp) assert(exp) + +namespace atomic { +/// @brief: Checks if type T is compatible with its atomic representation. +/// @param: ptr(Input), a pointer to type T for check. +/// @return: void. +template +static __forceinline void BasicCheck(const T* ptr) { + static_assert(sizeof(T) == sizeof(std::atomic), + "Type is size incompatible with its atomic representation!"); + lockless_check( + reinterpret_cast*>(ptr)->is_lock_free() && + "Atomic operation is not lock free! Use may conflict with peripheral HW " + "atomics!"); +}; + +/// @brief: function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to a volatile type. +/// @return: void. +template +static __forceinline void BasicCheck(const volatile T* ptr) { + static_assert(sizeof(T) == sizeof(std::atomic), + "Type is size incompatible with its atomic representation!"); + lockless_check( + reinterpret_cast*>(ptr)->is_lock_free() && + "Atomic operation is not lock free! Use may conflict with peripheral HW " + "atomics!"); +}; + +/// @brief: Load value of type T atomically with specified memory order. +/// @param: ptr(Input), a pointer to type T. +/// @param: order(Input), memory order with atomic load, relaxed by default. +/// @return: T, loaded value. +template +static __forceinline T + Load(const T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + const std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->load(order); +} + +/// @brief: function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile type T. 
+/// @param: order(Input), memory order with atomic load, relaxed by default. +/// @return: T, loaded value. +template +static __forceinline T + Load(const volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile const std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->load(order); +} + +/// @brief: Store value of type T with specified memory order. +/// @param: ptr(Input), a pointer to instance which will be stored. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order with atomic store, relaxed by default. +/// @return: void. +template +static __forceinline void Store( + T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + aptr->store(val, order); +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile instance which will be stored. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order with atomic store, relaxed by default. +/// @return: void. +template +static __forceinline void Store( + volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + aptr->store(val, order); +} + +/// @brief: Compare and swap value atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored if condition is satisfied. +/// @param: expected(Input), value which is expected. +/// @param: order(Input), memory order with atomic operation. +/// @return: T, observed value of type T. 
+template +static __forceinline T + Cas(T* ptr, T val, T expected, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + aptr->compare_exchange_strong(expected, val, order); + return expected; +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be stored if condition is satisfied. +/// @param: expected(Input), value which is expected. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, observed value of type T. +template +static __forceinline T + Cas(volatile T* ptr, T val, T expected, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + aptr->compare_exchange_strong(expected, val, order); + return expected; +} + +/// @brief: Exchange the value atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value prior to the exchange. +template +static __forceinline T + Exchange(T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->exchange(val, order); +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value prior to the exchange. 
+template +static __forceinline T + Exchange(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->exchange(val, order); +} + +/// @brief: Add value to variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be added. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value of the variable prior to the addition. +template +static __forceinline T + Add(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->fetch_add(val, order); +} + +/// @brief: Subtract value from the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be subtraced. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of the variable prior to the subtraction. +template +static __forceinline T + Sub(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->fetch_sub(val, order); +} + +/// @brief: Bit And operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is ANDed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + And(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->fetch_and(val, order); +} + +/// @brief: Bit Or operation on variable atomically with specified memory order. 
+/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is ORed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + Or(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->fetch_or(val, order); +} + +/// @brief: Bit Xor operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is XORed with variable. +/// @order: order(Input), memory order which is relaxed by default. +/// @return: T, valud of variable prior to the opertaion. +template +static __forceinline T + Xor(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->fetch_xor(val, order); +} + +/// @brief: Increase the value of variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + Increment(T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->fetch_add(1, order); +} + +/// @brief: Decrease the value of the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + Decrement(T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + std::atomic* aptr = reinterpret_cast*>(ptr); + return aptr->fetch_sub(1, order); +} + +/// @brief: Add value to variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be added. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value of the variable prior to the addition. +template +static __forceinline T + Add(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->fetch_add(val, order); +} + +/// @brief: Subtract value from the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be subtraced. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of the variable prior to the subtraction. +template +static __forceinline T + Sub(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->fetch_sub(val, order); +} + +/// @brief: Bit And operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is ANDed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + And(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->fetch_and(val, order); +} + +/// @brief: Bit Or operation on variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is ORed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T Or(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->fetch_or(val, order); +} + +/// @brief: Bit Xor operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is XORed with variable. +/// @order: order(Input), memory order which is relaxed by default. +/// @return: T, valud of variable prior to the opertaion. +template +static __forceinline T + Xor(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->fetch_xor(val, order); +} + +/// @brief: Increase the value of variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + Increment(volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->fetch_add(1, order); +} + +/// @brief: Decrease the value of the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + Decrement(volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + volatile std::atomic* aptr = + reinterpret_cast*>(ptr); + return aptr->fetch_sub(1, order); +} +} + +// Remove special assert to avoid name polution +#undef lockless_check + +#endif // HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ diff --git a/runtime/hsa-runtime/core/util/lnx/os_linux.cpp b/runtime/hsa-runtime/core/util/lnx/os_linux.cpp new file mode 100644 index 0000000000..c83c765144 --- /dev/null +++ b/runtime/hsa-runtime/core/util/lnx/os_linux.cpp @@ -0,0 +1,344 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __linux__ +#include "core/util/os.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace os { + +static_assert(sizeof(LibHandle) == sizeof(void*), + "OS abstraction size mismatch"); +static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), + "OS abstraction size mismatch"); +static_assert(sizeof(Thread) == sizeof(pthread_t), + "OS abstraction size mismatch"); + +LibHandle LoadLib(std::string filename) { + void* ret = dlopen(filename.c_str(), RTLD_LAZY); + return *(LibHandle*)&ret; +} + +void* GetExportAddress(LibHandle lib, std::string export_name) { + void* ret = dlsym(*(void**)&lib, export_name.c_str()); + + // dlsym searches the given library and all the library's load dependencies. + // Remaining code limits symbol lookup to only the library handle given. + // This lookup pattern matches Windows. + if (ret == NULL) return ret; + + link_map* map; + int err = dlinfo(*(void**)&lib, RTLD_DI_LINKMAP, &map); + assert(err != -1 && "dlinfo failed."); + + Dl_info info; + err = dladdr(ret, &info); + assert(err != 0 && "dladdr failed."); + + if (strcmp(info.dli_fname, map->l_name) == 0) return ret; + + return NULL; +} + +void CloseLib(LibHandle lib) { dlclose(*(void**)&lib); } + +Mutex CreateMutex() { + pthread_mutex_t* mutex = new pthread_mutex_t; + pthread_mutex_init(mutex, NULL); + return *(Mutex*)&mutex; +} + +bool TryAcquireMutex(Mutex lock) { + return pthread_mutex_trylock(*(pthread_mutex_t**)&lock) == 0; +} + +bool AcquireMutex(Mutex lock) { + return pthread_mutex_lock(*(pthread_mutex_t**)&lock) == 0; +} + +void ReleaseMutex(Mutex lock) { + pthread_mutex_unlock(*(pthread_mutex_t**)&lock); +} + +void DestroyMutex(Mutex lock) { + pthread_mutex_destroy(*(pthread_mutex_t**)&lock); + delete *(pthread_mutex_t**)&lock; +} + +void Sleep(int delay_in_millisec) { usleep(delay_in_millisec * 1000); } + +void YieldThread() { 
sched_yield(); } + +struct ThreadArgs { + void* entry_args; + ThreadEntry entry_function; +}; + +void* __stdcall ThreadTrampoline(void* arg) { + ThreadArgs* ar = (ThreadArgs*)arg; + ThreadEntry CallMe = ar->entry_function; + void* Data = ar->entry_args; + delete ar; + CallMe(Data); + return NULL; +} + +Thread CreateThread(ThreadEntry function, void* threadArgument, + uint stackSize) { + ThreadArgs* args = new ThreadArgs; + args->entry_args = threadArgument; + args->entry_function = function; + pthread_t thread; + pthread_attr_t attrib; + pthread_attr_init(&attrib); + if (stackSize != 0) pthread_attr_setstacksize(&attrib, stackSize); + bool success = + (pthread_create(&thread, &attrib, ThreadTrampoline, args) == 0); + pthread_attr_destroy(&attrib); + if (!success) { + pthread_join(thread, NULL); + return NULL; + } + return *(Thread*)&thread; +} + +void CloseThread(Thread thread) { pthread_detach(*(pthread_t*)&thread); } + +bool WaitForThread(Thread thread) { + return pthread_join(*(pthread_t*)&thread, NULL); +} + +bool WaitForAllThreads(Thread* threads, uint threadCount) { + for (uint i = 0; i < threadCount; i++) WaitForThread(threads[i]); + return true; +} + +void SetEnvVar(std::string env_var_name, std::string env_var_value) { + setenv(env_var_name.c_str(), env_var_value.c_str(), 1); +} + +std::string GetEnvVar(std::string env_var_name) { + char* buff; + buff = getenv(env_var_name.c_str()); + std::string ret; + if (buff) { + ret = buff; + } + return ret; +} + +size_t GetUserModeVirtualMemorySize() { +#ifdef _LP64 + // https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt : + // user space is 0000000000000000 - 00007fffffffffff (=47 bits) + return (size_t)(0x800000000000); +#else + return (size_t)(0xffffffff); // ~4GB +#endif +} + +size_t GetUsablePhysicalHostMemorySize() { + struct sysinfo info = {0}; + if (sysinfo(&info) != 0) { + return 0; + } + + const size_t physical_size = + static_cast(info.totalram * info.mem_unit); + return 
std::min(GetUserModeVirtualMemorySize(), physical_size); +} + +uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; } + +// Os event implementation +typedef struct EventDescriptor_ { + pthread_cond_t event; + pthread_mutex_t mutex; + bool state; + bool auto_reset; +} EventDescriptor; + +EventHandle CreateOsEvent(bool auto_reset, bool init_state) { + EventDescriptor* eventDescrp; + eventDescrp = (EventDescriptor*)malloc(sizeof(EventDescriptor)); + + pthread_mutex_init(&eventDescrp->mutex, NULL); + pthread_cond_init(&eventDescrp->event, NULL); + eventDescrp->auto_reset = auto_reset; + eventDescrp->state = init_state; + + EventHandle handle = reinterpret_cast(eventDescrp); + + return handle; +} + +int DestroyOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = pthread_cond_destroy(&eventDescrp->event); + ret_code |= pthread_mutex_destroy(&eventDescrp->mutex); + free(eventDescrp); + return ret_code; +} + +int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + // Event wait time is 0 and state is non-signaled, return directly + if (milli_seconds == 0) { + int tmp_ret = pthread_mutex_trylock(&eventDescrp->mutex); + if (tmp_ret == EBUSY) { + // Timeout + return 1; + } + } + + int ret_code = 0; + pthread_mutex_lock(&eventDescrp->mutex); + if (!eventDescrp->state) { + if (milli_seconds == 0) { + ret_code = 1; + } else { + struct timespec ts; + struct timeval tp; + + ret_code = gettimeofday(&tp, NULL); + ts.tv_sec = tp.tv_sec; + ts.tv_nsec = tp.tv_usec * 1000; + + unsigned int sec = milli_seconds / 1000; + unsigned int mSec = milli_seconds % 1000; + + ts.tv_sec += sec; + ts.tv_nsec += mSec * 1000000; + + // More then one second, add 1 sec to the tv_sec elem + if (ts.tv_nsec > 1000000000) { + ts.tv_sec += 1; + ts.tv_nsec = ts.tv_nsec - 1000000000; + } + + 
ret_code = + pthread_cond_timedwait(&eventDescrp->event, &eventDescrp->mutex, &ts); + // Time out + if (ret_code == 110) { + ret_code = 0x14003; // 1 means time out in HSA + } + + if (ret_code == 0 && eventDescrp->auto_reset) { + eventDescrp->state = false; + } + } + } else if (eventDescrp->auto_reset) { + eventDescrp->state = false; + } + pthread_mutex_unlock(&eventDescrp->mutex); + + return ret_code; +} + +int SetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = 0; + ret_code = pthread_mutex_lock(&eventDescrp->mutex); + eventDescrp->state = true; + ret_code = pthread_mutex_unlock(&eventDescrp->mutex); + ret_code |= pthread_cond_signal(&eventDescrp->event); + + return ret_code; +} + +int ResetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = 0; + ret_code = pthread_mutex_lock(&eventDescrp->mutex); + eventDescrp->state = false; + ret_code = pthread_mutex_unlock(&eventDescrp->mutex); + + return ret_code; +} + +uint64_t ReadAccurateClock() { + timespec time; + int err = clock_gettime(CLOCK_MONOTONIC_RAW, &time); + assert(err == 0 && "clock_gettime(CLOCK_MONOTONIC_RAW,...) failed"); + return uint64_t(time.tv_sec) * 1000000000ull + uint64_t(time.tv_nsec); +} + +uint64_t AccurateClockFrequency() { + timespec time; + int err = clock_getres(CLOCK_MONOTONIC_RAW, &time); + assert(err == 0 && "clock_getres(CLOCK_MONOTONIC_RAW,...) failed"); + assert(time.tv_sec == 0 && + "clock_getres(CLOCK_MONOTONIC_RAW,...) returned very low frequency " + "(<1Hz)."); + assert(time.tv_nsec < 0xFFFFFFFF && + "clock_getres(CLOCK_MONOTONIC_RAW,...) 
returned very low frequency " + "(<1Hz)."); + return uint64_t(time.tv_nsec) * 1000000000ull; +} +} + +#endif diff --git a/runtime/hsa-runtime/core/util/locks.h b/runtime/hsa-runtime/core/util/locks.h new file mode 100644 index 0000000000..6ea35f5685 --- /dev/null +++ b/runtime/hsa-runtime/core/util/locks.h @@ -0,0 +1,136 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Library of syncronization primitives - to be added to as needed. + +#ifndef HSA_RUNTIME_CORE_UTIL_LOCKS_H_ +#define HSA_RUNTIME_CORE_UTIL_LOCKS_H_ + +#include "utils.h" +#include "os.h" + +/// @brief: A class behaves as a lock in a scope. When trying to enter into the +/// critical section, creat a object of this class. After the control path goes +/// out of the scope, it will release the lock automatically. +template +class ScopedAcquire { + public: + /// @brief: When constructing, acquire the lock. + /// @param: lock(Input), pointer to an existing lock. + explicit ScopedAcquire(LockType* lock) : lock_(lock) { lock_->Acquire(); } + + /// @brief: when destructing, release the lock. + ~ScopedAcquire() { lock_->Release(); } + + private: + LockType* lock_; + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(ScopedAcquire); +}; + +/// @brief: a class represents a kernel mutex. +/// Uses the kernel's scheduler to keep the waiting thread from being scheduled +/// until the lock is released (Best for long waits, though anything using +/// a kernel object is a long wait). +class KernelMutex { + public: + KernelMutex() { lock_ = os::CreateMutex(); } + ~KernelMutex() { os::DestroyMutex(lock_); } + + bool Try() { return os::TryAcquireMutex(lock_); } + bool Acquire() { return os::AcquireMutex(lock_); } + void Release() { os::ReleaseMutex(lock_); } + + private: + os::Mutex lock_; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(KernelMutex); +}; + +/// @brief: represents a spin lock. 
+/// For very short hold durations on the order of the thread scheduling +/// quanta or less. +class SpinMutex { + public: + SpinMutex() { lock_ = 0; } + + bool Try() { + int old = 0; + return lock_.compare_exchange_strong(old, 1); + } + bool Acquire() { + int old = 0; + while (!lock_.compare_exchange_strong(old, 1)) + { + old=0; + os::YieldThread(); + } + return true; + } + void Release() { lock_ = 0; } + + private: + std::atomic lock_; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(SpinMutex); +}; + +class KernelEvent { + public: + KernelEvent() { evt_ = os::CreateOsEvent(true, true); } + ~KernelEvent() { os::DestroyOsEvent(evt_); } + + bool IsSet() { return os::WaitForOsEvent(evt_, 0)==0; } + bool WaitForSet() { return os::WaitForOsEvent(evt_, 0xFFFFFFFF)==0; } + void Set() { os::SetOsEvent(evt_); } + void Reset() { os::ResetOsEvent(evt_); } + + private: + os::EventHandle evt_; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(KernelEvent); +}; + +#endif // HSA_RUNTIME_CORE_SUTIL_LOCKS_H_ diff --git a/runtime/hsa-runtime/core/util/os.h b/runtime/hsa-runtime/core/util/os.h new file mode 100644 index 0000000000..c3936e32da --- /dev/null +++ b/runtime/hsa-runtime/core/util/os.h @@ -0,0 +1,216 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Minimal operating system abstraction interfaces. 
+ +#ifndef HSA_RUNTIME_CORE_UTIL_OS_H_ +#define HSA_RUNTIME_CORE_UTIL_OS_H_ + +#include +#include "utils.h" + +namespace os { +typedef void* LibHandle; +typedef void* Mutex; +typedef void* Thread; +typedef void* EventHandle; + +enum class os_t { OS_WIN = 0, OS_LINUX, COUNT }; +static __forceinline std::underlying_type::type os_index(os_t val) { + return std::underlying_type::type(val); +} + +#ifdef _WIN32 +static const os_t current_os = os_t::OS_WIN; +#elif __linux__ +static const os_t current_os = os_t::OS_LINUX; +#else +static_assert(false, "Operating System not detected!"); +#endif + +/// @brief: Loads dynamic library based on file name. Return value will be NULL +/// if failed. +/// @param: filename(Input), file name of the library. +/// @return: LibHandle. +LibHandle LoadLib(std::string filename); + +/// @brief: Gets the address of exported symbol. Return NULl if failed. +/// @param: lib(Input), library handle which exporting from. +/// @param: export_name(Input), the name of the exported symbol. +/// @return: void*. +void* GetExportAddress(LibHandle lib, std::string export_name); + +/// @brief: Unloads the dynamic library. +/// @param: lib(Input), library handle which will be unloaded. +void CloseLib(LibHandle lib); + +/// @brief: Creates a mutex, will return NULL if failed. +/// @param: void. +/// @return: Mutex. +Mutex CreateMutex(); + +/// @brief: Tries to acquire the mutex once, if successed, return true. +/// @param: lock(Input), handle to the mutex. +/// @return: bool. +bool TryAcquireMutex(Mutex lock); + +/// @brief: Aquires the mutex, if the mutex is locked, it will wait until it is +/// released. If the mutex is acquired successfully, it will return true. +/// @param: lock(Input), handle to the mutex. +/// @return: bool. +bool AcquireMutex(Mutex lock); + +/// @brief: Releases the mutex. +/// @param: lock(Input), handle to the mutex. +/// @return: void. +void ReleaseMutex(Mutex lock); + +/// @brief: Destroys the mutex. 
+/// @param: lock(Input), handle to the mutex. +/// @return: void. +void DestroyMutex(Mutex lock); + +/// @brief: Puts current thread to sleep. +/// @param: delayInMs(Input), time in millisecond for sleeping. +/// @return: void. +void Sleep(int delayInMs); + +/// @brief: Yields current thread. +/// @param: void. +/// @return: void. +void YieldThread(); + +typedef void (*ThreadEntry)(void*); + +/// @brief: Creates a thread will return NULL if failed. +/// @param: entry_function(Input), a pointer to the function which the thread +/// starts from. +/// @param: entry_argument(Input), a pointer to the argument of the thread +/// function. +/// @param: stack_size(Input), size of the thread's stack, 0 by default. +/// @return: Thread, a handle to thread created. +Thread CreateThread(ThreadEntry entry_function, void* entry_argument, + uint stack_size = 0); + +/// @brief: Destroys the thread. +/// @param: thread(Input), thread handle to what will be destroyed. +/// @return: void. +void CloseThread(Thread thread); + +/// @brief: Waits for specific thread to finish, if successed, return true. +/// @param: thread(Input), handle to waiting thread. +/// @return: bool. +bool WaitForThread(Thread thread); + +/// @brief: Waits for multiple threads to finish, if successed, return ture. +/// @param; threads(Input), a pointer to a list of thread handle. +/// @param: thread_count(Input), number of threads to be waited on. +/// @return: bool. +bool WaitForAllThreads(Thread* threads, uint thread_count); + +/// @brief: Sets the environment value. +/// @param: env_var_name(Input), name of the environment value. +/// @param: env_var_value(Input), value of the environment value.s +/// @return: void. +void SetEnvVar(std::string env_var_name, std::string env_var_value); + +/// @brief: Gets the value of environment value. +/// @param: env_var_name(Input), name of the environment value. +/// @return: std::string, value of the environment value, returned as string. 
+std::string GetEnvVar(std::string env_var_name); + +/// @brief: Gets the max virtual memory size accessible to the application. +/// @param: void. +/// @return: size_t, size of the accessible memory to the application. +size_t GetUserModeVirtualMemorySize(); + +/// @brief: Gets the max physical host system memory size. +/// @param: void. +/// @return: size_t, size of the physical host system memory. +size_t GetUsablePhysicalHostMemorySize(); + +/// @brief: Gets the virtual memory base address. It is hardcoded to 0. +/// @param: void. +/// @return: uintptr_t, always 0. +uintptr_t GetUserModeVirtualMemoryBase(); + +/// @brief os event api, create an event +/// @param: auto_reset whether an event can reset the status automatically +/// @param: init_state initial state of the event +/// @return: event handle +EventHandle CreateOsEvent(bool auto_reset, bool init_state); + +/// @brief os event api, destroy an event +/// @param: event handle +/// @return: whether destroy is correct +int DestroyOsEvent(EventHandle event); + +/// @brief os event api, wait on event +/// @param: event Event handle +/// @param: milli_seconds wait time +/// @return: Indicate success or timeout +int WaitForOsEvent(EventHandle event, unsigned int milli_seconds); + +/// @brief os event api, set event state +/// @param: event Event handle +/// @return: Whether event set is correct +int SetOsEvent(EventHandle event); + +/// @brief os event api, reset event state +/// @param: event Event handle +/// @return: Whether event reset is correct +int ResetOsEvent(EventHandle event); + +/// @brief reads a clock which is deemed to be accurate for elapsed time +/// measurements, though not necessarilly fast to query +/// @return clock counter value +uint64_t ReadAccurateClock(); + +/// @brief retrieves the frequency in Hz of the unit used in ReadAccurateClock. 
+/// It does not necessarilly reflect the resolution of the clock, but is the +/// value needed to convert a difference in the clock's counter value to elapsed +/// seconds. This frequency does not change at runtime. +/// @return returns the frequency +uint64_t AccurateClockFrequency(); +} + +#endif // HSA_RUNTIME_CORE_UTIL_OS_H_ diff --git a/runtime/hsa-runtime/core/util/small_heap.cpp b/runtime/hsa-runtime/core/util/small_heap.cpp new file mode 100644 index 0000000000..82e4909360 --- /dev/null +++ b/runtime/hsa-runtime/core/util/small_heap.cpp @@ -0,0 +1,174 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "small_heap.h" + +SmallHeap::memory_t::iterator SmallHeap::merge( + SmallHeap::memory_t::iterator& keep, + SmallHeap::memory_t::iterator& destroy) { + assert((char*)keep->first + keep->second.len == (char*)destroy->first && + "Invalid merge"); + assert(keep->second.isfree() && "Merge with allocated block"); + assert(destroy->second.isfree() && "Merge with allocated block"); + + keep->second.len += destroy->second.len; + keep->second.next_free = destroy->second.next_free; + if (!destroy->second.islastfree()) + memory[destroy->second.next_free].prior_free = keep->first; + + memory.erase(destroy); + return keep; +} + +void SmallHeap::free(void* ptr) { + if (ptr == NULL) return; + + auto iterator = memory.find(ptr); + + // Check for illegal free + if (iterator == memory.end()) { + assert(false && "Illegal free."); + return; + } + + const auto start_guard = memory.find(0); + const auto end_guard = memory.find((void*)0xFFFFFFFFFFFFFFFFull); + + // Return memory to total and link node into free list + total_free += iterator->second.len; + if (first_free < iterator->first) { + auto before = iterator; + before--; + while (before != start_guard && !before->second.isfree()) before--; + assert(before->second.next_free > iterator->first && + "Inconsistency in small heap."); + iterator->second.prior_free = before->first; + iterator->second.next_free = before->second.next_free; + 
before->second.next_free = iterator->first; + if (!iterator->second.islastfree()) + memory[iterator->second.next_free].prior_free = iterator->first; + } else { + iterator->second.setfirstfree(); + iterator->second.next_free = first_free; + first_free = iterator->first; + if (!iterator->second.islastfree()) + memory[iterator->second.next_free].prior_free = iterator->first; + } + + // Attempt compaction + auto before = iterator; + before--; + if (before != start_guard) { + if (before->second.isfree()) { + iterator = merge(before, iterator); + } + } + + auto after = iterator; + after++; + if (after != end_guard) { + if (after->second.isfree()) { + iterator = merge(iterator, after); + } + } +} + +void* SmallHeap::alloc(size_t bytes) { + // Is enough memory available? + if ((bytes > total_free) || (bytes == 0)) return NULL; + + memory_t::iterator current; + memory_t::iterator prior; + + // Walk the free list and allocate at first fitting location + prior = current = memory.find(first_free); + while (true) { + if (bytes <= current->second.len) { + // Decrement from total + total_free -= bytes; + + // Is allocation an exact fit? 
+ if (bytes == current->second.len) { + if (prior == current) { + first_free = current->second.next_free; + if (!current->second.islastfree()) + memory[current->second.next_free].setfirstfree(); + } else { + prior->second.next_free = current->second.next_free; + if (!current->second.islastfree()) + memory[current->second.next_free].prior_free = prior->first; + } + current->second.next_free = NULL; + return current->first; + } else { + // Split current node + void* remaining = (char*)current->first + bytes; + Node& node = memory[remaining]; + node.next_free = current->second.next_free; + node.prior_free = current->second.prior_free; + node.len = current->second.len - bytes; + current->second.len = bytes; + + if (prior == current) { + first_free = remaining; + node.setfirstfree(); + } else { + prior->second.next_free = remaining; + node.prior_free = prior->first; + } + if (!node.islastfree()) memory[node.next_free].prior_free = remaining; + + current->second.next_free = NULL; + return current->first; + } + } + + // End of free list? + if (current->second.islastfree()) break; + + prior = current; + current = memory.find(current->second.next_free); + } + + // Can't service the request due to fragmentation + return NULL; +} diff --git a/runtime/hsa-runtime/core/util/small_heap.h b/runtime/hsa-runtime/core/util/small_heap.h new file mode 100644 index 0000000000..0da5ac280f --- /dev/null +++ b/runtime/hsa-runtime/core/util/small_heap.h @@ -0,0 +1,114 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// A simple first fit memory allocator with eager compaction. For use with few +// items (where list iteration is faster than trees). +// Not thread safe! 
+ +#ifndef HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_ +#define HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_ + +#include "utils.h" + +#include + +class SmallHeap { + public: + class Node { + public: + size_t len; + void* next_free; + void* prior_free; + static const intptr_t END = -1; + + __forceinline bool isfree() const { return next_free != NULL; } + __forceinline bool islastfree() const { return intptr_t(next_free) == END; } + __forceinline bool isfirstfree() const { + return intptr_t(prior_free) == END; + } + __forceinline void setlastfree() { + *reinterpret_cast(&next_free) = END; + } + __forceinline void setfirstfree() { + *reinterpret_cast(&prior_free) = END; + } + }; + + private: + SmallHeap(const SmallHeap& rhs); + SmallHeap& operator=(const SmallHeap& rhs); + + void* const pool; + const size_t length; + + size_t total_free; + void* first_free; + std::map memory; + + typedef decltype(memory) memory_t; + memory_t::iterator merge(memory_t::iterator& keep, + memory_t::iterator& destroy); + + public: + SmallHeap() : pool(NULL), length(0), total_free(0) {} + SmallHeap(void* base, size_t length) + : pool(base), length(length), total_free(length) { + first_free = pool; + + Node& node = memory[first_free]; + node.len = length; + node.setlastfree(); + node.setfirstfree(); + + memory[0].len = 0; + memory[(void*)0xFFFFFFFFFFFFFFFFull].len = 0; + } + + void* alloc(size_t bytes); + void free(void* ptr); + + void* base() const { return pool; } + size_t size() const { return length; } + size_t remaining() const { return total_free; } +}; + +#endif diff --git a/runtime/hsa-runtime/core/util/timer.cpp b/runtime/hsa-runtime/core/util/timer.cpp new file mode 100644 index 0000000000..1fa275b49b --- /dev/null +++ b/runtime/hsa-runtime/core/util/timer.cpp @@ -0,0 +1,105 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. 
All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/util/timer.h" + +namespace timer { + +accurate_clock::init::init() { + freq = os::AccurateClockFrequency(); + accurate_clock::period_ns = 1e9 / double(freq); +} + +// Calibrates the fast clock using the accurate clock. +fast_clock::init::init() { + typedef accurate_clock clock; + clock::duration delay(std::chrono::milliseconds(1)); + + // calibrate clock + fast_clock::raw_rep min = 0; + clock::duration elapsed = clock::duration::max(); + + do { + for (int t = 0; t < 10; t++) { + fast_clock::raw_rep r1, r2; + clock::time_point t0, t1, t2, t3; + + t0 = clock::now(); + std::atomic_signal_fence(std::memory_order_acq_rel); + r1 = fast_clock::raw_now(); + std::atomic_signal_fence(std::memory_order_acq_rel); + t1 = clock::now(); + std::atomic_signal_fence(std::memory_order_acq_rel); + + do { + t2 = clock::now(); + } while (t2 - t1 < delay); + + std::atomic_signal_fence(std::memory_order_acq_rel); + r2 = fast_clock::raw_now(); + std::atomic_signal_fence(std::memory_order_acq_rel); + t3 = clock::now(); + + // If elapsed time is shorter than last recorded time and both the start + // and end times are confirmed correlated then record the clock readings. 
+ // This protects against inaccuracy due to thread switching + if ((t3 - t1 < elapsed) && ((t1 - t0) * 10 < (t2 - t1)) && + ((t3 - t2) * 10 < (t2 - t1))) { + elapsed = t3 - t1; + min = r2 - r1; + } + } + delay += delay; + } while (min < 1000); + + fast_clock::freq = double(min) / duration_in_seconds(elapsed); + fast_clock::period_ps = 1e12 / fast_clock::freq; +} + +double accurate_clock::period_ns; +accurate_clock::raw_frequency accurate_clock::freq; +accurate_clock::init accurate_clock::accurate_clock_init; + +double fast_clock::period_ps; +fast_clock::raw_frequency fast_clock::freq; +fast_clock::init fast_clock::fast_clock_init; +} diff --git a/runtime/hsa-runtime/core/util/timer.h b/runtime/hsa-runtime/core/util/timer.h new file mode 100644 index 0000000000..bec1d2c178 --- /dev/null +++ b/runtime/hsa-runtime/core/util/timer.h @@ -0,0 +1,162 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_TIMER_H_ +#define HSA_RUNTIME_CORE_UTIL_TIMER_H_ + +#include "core/util/utils.h" +#include "core/util/os.h" +#include + +#include + +namespace timer { + +// Needed to patch around a mixed arithmetic bug in MSVC's duration_cast as of +// VS 2013. 
+template +struct wide_type { + typedef double type; +}; +template <> +struct wide_type { + typedef uintmax_t type; +}; +template <> +struct wide_type { + typedef intmax_t type; +}; + +template +static __forceinline To + duration_cast(const std::chrono::duration& d) { + typedef typename wide_type::value, + std::is_signed::value>::type wide; + typedef std::chrono::duration unit_convert_t; + + unit_convert_t temp = std::chrono::duration_cast(d); + return To(static_cast(temp.count())); +} +// End patch + +template +static __forceinline double duration_in_seconds( + std::chrono::duration delta) { + typedef std::chrono::duration> seconds; + return seconds(delta).count(); +} + +template +static __forceinline rep duration_from_seconds(double delta) { + typedef std::chrono::duration> seconds; + return std::chrono::duration_cast(seconds(delta)); +} + +// Provices a C++11 standard clock interface to the os::AccurateClock functions +class accurate_clock { + public: + typedef double rep; + typedef std::nano period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + + static const bool is_steady = true; + + static __forceinline time_point now() { + return time_point(duration(raw_now() * period_ns)); + } + + // These two extra APIs and types let us use clocks without conversion to the + // arbitrary period unit + typedef uint64_t raw_rep; + typedef uint64_t raw_frequency; + + static __forceinline raw_rep raw_now() { return os::ReadAccurateClock(); } + static __forceinline raw_frequency raw_freq() { return freq; } + + private: + static double period_ns; + static raw_frequency freq; + + class init { + public: + init(); + }; + static init accurate_clock_init; +}; + +// Provices a C++11 standard clock interface to the lowest latency approximate +// clock +class fast_clock { + public: + typedef double rep; + typedef std::pico period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + + static const bool 
is_steady = true; + + static __forceinline time_point now() { + return time_point(duration(raw_now() * period_ps)); + } + + // These two extra APIs and types let us use clocks without conversion to the + // arbitrary period unit + typedef uint64_t raw_rep; + typedef double raw_frequency; + + static __forceinline raw_rep raw_now() { return __rdtsc(); } + static __forceinline raw_frequency raw_freq() { return freq; } + + private: + static double period_ps; + static raw_frequency freq; + + class init { + public: + init(); + }; + static init fast_clock_init; +}; +} + +#endif diff --git a/runtime/hsa-runtime/core/util/utils.h b/runtime/hsa-runtime/core/util/utils.h new file mode 100644 index 0000000000..7b3b7ad101 --- /dev/null +++ b/runtime/hsa-runtime/core/util/utils.h @@ -0,0 +1,267 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Generally useful utility functions + +#ifndef HSA_RUNTIME_CORE_UTIL_UTILS_H_ +#define HSA_RUNTIME_CORE_UTIL_UTILS_H_ + +#include "stdint.h" +#include "stddef.h" +#include "stdlib.h" +#include + +typedef unsigned int uint; +typedef uint64_t uint64; + +#if defined(__GNUC__) +#include "mm_malloc.h" +#if defined(__i386__) || defined(__x86_64__) +#include +#else +#error \ + "Processor or compiler not identified. " \ + "Need to provide a lightweight approximate clock interface via function uint64_t __rdtsc() or adapt timer.h to your platform." 
+#endif + +#define __forceinline __inline__ __attribute__((always_inline)) +static __forceinline void __debugbreak() { __builtin_trap(); } +#define __declspec(x) __attribute__((x)) +#undef __stdcall +#define __stdcall // __attribute__((__stdcall__)) +#define __ALIGNED__(x) __attribute__((aligned(x))) + +static __forceinline void* _aligned_malloc(size_t size, size_t alignment) { + return _mm_malloc(size, alignment); +} +static __forceinline void _aligned_free(void* ptr) { return _mm_free(ptr); } +#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) +#include "intrin.h" +#define __ALIGNED__(x) __declspec(align(x)) +#if (_MSC_VER < 1800) +static __forceinline unsigned long long int strtoull(const char* str, + char** endptr, int base) { + return static_cast(_strtoui64(str, endptr, base)); +} +#endif +#else +#error "Compiler and/or processor not identified." +#endif + +#define STRING2(x) #x +#define STRING(x) STRING2(x) + +#define PASTE2(x, y) x##y +#define PASTE(x, y) PASTE2(x, y) + +// A macro to disallow the copy and move constructor and operator= functions +// This should be used in the private: declarations for a class +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + TypeName(TypeName&&); \ + void operator=(const TypeName&); \ + void operator=(TypeName&&); + +template +class ScopeGuard { + public: + explicit __forceinline ScopeGuard(const lambda& release) + : release_(release), dismiss_(false) {} + + ScopeGuard(ScopeGuard& rhs) { *this = rhs; } + + __forceinline ~ScopeGuard() { + if (!dismiss_) release_(); + } + __forceinline ScopeGuard& operator=(ScopeGuard& rhs) { + dismiss_ = rhs.dismiss_; + release_ = rhs.release_; + rhs.dismiss_ = true; + } + __forceinline void Dismiss() { dismiss_ = true; } + + private: + lambda release_; + bool dismiss_; +}; + +template +static __forceinline ScopeGuard MakeScopeGuard(lambda rel) { + return ScopeGuard(rel); +} + +#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) 
\ + auto lname = __VA_ARGS__; \ + ScopeGuard sname(lname); +#define MAKE_SCOPE_GUARD(...) \ + MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \ + PASTE(scopeGuard, __COUNTER__), __VA_ARGS__) +#define MAKE_NAMED_SCOPE_GUARD(name, ...) \ + MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \ + __VA_ARGS__) + +/// @brief: Finds out the min one of two inputs, input must support ">" +/// operator. +/// @param: a(Input), a reference to type T. +/// @param: b(Input), a reference to type T. +/// @return: T. +template +static __forceinline T Min(const T& a, const T& b) { + return (a > b) ? b : a; +} + +/// @brief: Find out the max one of two inputs, input must support ">" operator. +/// @param: a(Input), a reference to type T. +/// @param: b(Input), a reference to type T. +/// @return: T. +template +static __forceinline T Max(const T& a, const T& b) { + return (b > a) ? b : a; +} + +/// @brief: Free the memory space which is newed previously. +/// @param: ptr(Input), a pointer to memory space. Can't be NULL. +/// @return: void. +struct DeleteObject { + template + void operator()(const T* ptr) const { + delete ptr; + } +}; + +/// @brief: Checks if a value is power of two, if it is, return true. Be careful +/// when passing 0. +/// @param: val(Input), the data to be checked. +/// @return: bool. +template +static __forceinline bool IsPowerOfTwo(T val) { + return (val & (val - 1)) == 0; +} + +/// @brief: Calculates the floor value aligned based on parameter of alignment. +/// If value is at the boundary of alignment, it is unchanged. +/// @param: value(Input), value to be calculated. +/// @param: alignment(Input), alignment value. +/// @return: T. +template +static __forceinline T AlignDown(T value, size_t alignment) { + assert(IsPowerOfTwo(alignment)); + return (T)(value & ~(alignment - 1)); +} + +/// @brief: Same as previous one, but first parameter becomes pointer, for more +/// info, see the previous desciption. 
+/// @param: value(Input), pointer to type T. +/// @param: alignment(Input), alignment value. +/// @return: T*, pointer to type T. +template +static __forceinline T* AlignDown(T* value, size_t alignment) { + return (T*)AlignDown((intptr_t)value, alignment); +} + +/// @brief: Calculates the ceiling value aligned based on parameter of +/// alignment. +/// If value is at the boundary of alignment, it is unchanged. +/// @param: value(Input), value to be calculated. +/// @param: alignment(Input), alignment value. +/// @param: T. +template +static __forceinline T AlignUp(T value, size_t alignment) { + return AlignDown((T)(value + alignment - 1), alignment); +} + +/// @brief: Same as previous one, but first parameter becomes pointer, for more +/// info, see the previous desciption. +/// @param: value(Input), pointer to type T. +/// @param: alignment(Input), alignment value. +/// @return: T*, pointer to type T. +template +static __forceinline T* AlignUp(T* value, size_t alignment) { + return (T*)AlignDown((intptr_t)((uint8_t*)value + alignment - 1), alignment); +} + +/// @brief: Checks if the input value is at the boundary of alignment, if it is, +/// @return true. +/// @param: value(Input), value to be checked. +/// @param: alignment(Input), alignment value. +/// @return: bool. +template +static __forceinline bool IsMultipleOf(T value, size_t alignment) { + return (AlignUp(value, alignment) == value); +} + +/// @brief: Same as previous one, but first parameter becomes pointer, for more +/// info, see the previous desciption. +/// @param: value(Input), pointer to type T. +/// @param: alignment(Input), alignment value. +/// @return: bool. 
+template +static __forceinline bool IsMultipleOf(T* value, size_t alignment) { + return (AlignUp(value, alignment) == value); +} + +static __forceinline uint32_t NextPow2(uint32_t value) { + if (value == 0) return 1; + uint32_t v = value - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +static __forceinline uint64_t NextPow2(uint64_t value) { + if (value == 0) return 1; + uint64_t v = value - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + return v + 1; +} + +#include "atomic_helpers.h" + +#endif // HSA_RUNTIME_CORE_UTIL_UTIIS_H_ diff --git a/runtime/hsa-runtime/inc/Brig.h b/runtime/hsa-runtime/inc/Brig.h new file mode 100644 index 0000000000..1e441b3251 --- /dev/null +++ b/runtime/hsa-runtime/inc/Brig.h @@ -0,0 +1,1530 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +//.ignore{ + +#ifndef INCLUDED_BRIG_H +#define INCLUDED_BRIG_H + +#include + +enum BrigAuxDefs { + MAX_OPERANDS_NUM = 6 +}; + +//} + +typedef uint32_t BrigVersion32_t; + +enum BrigVersion { + + //.nowrap + //.nodump + //.nollvm + + BRIG_VERSION_HSAIL_MAJOR = 1, + BRIG_VERSION_HSAIL_MINOR = 0, + BRIG_VERSION_BRIG_MAJOR = 1, + BRIG_VERSION_BRIG_MINOR = 0 +}; + +typedef uint8_t BrigAlignment8_t; //.defValue=BRIG_ALIGNMENT_NONE + +typedef uint8_t BrigAllocation8_t; //.defValue=BRIG_ALLOCATION_NONE + +typedef uint8_t BrigAluModifier8_t; + +typedef uint8_t BrigAtomicOperation8_t; + +typedef uint32_t BrigCodeOffset32_t; //.defValue=0 //.wtype=ItemRef + +typedef uint8_t BrigCompareOperation8_t; + +typedef uint16_t BrigControlDirective16_t; + +typedef uint32_t BrigDataOffset32_t; + +typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; //.wtype=ListRef //.defValue=0 + +typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; //.wtype=ListRef //.defValue=0 + +typedef 
BrigDataOffset32_t BrigDataOffsetString32_t; //.wtype=StrRef //.defValue=0 + +typedef uint8_t BrigExecutableModifier8_t; + +typedef uint8_t BrigImageChannelOrder8_t; //.defValue=BRIG_CHANNEL_ORDER_UNKNOWN + +typedef uint8_t BrigImageChannelType8_t; //.defValue=BRIG_CHANNEL_TYPE_UNKNOWN + +typedef uint8_t BrigImageGeometry8_t; //.defValue=BRIG_GEOMETRY_UNKNOWN + +typedef uint8_t BrigImageQuery8_t; + +typedef uint16_t BrigKind16_t; + +typedef uint8_t BrigLinkage8_t; //.defValue=BRIG_LINKAGE_NONE + +typedef uint8_t BrigMachineModel8_t; //.defValue=BRIG_MACHINE_LARGE + +typedef uint8_t BrigMemoryModifier8_t; + +typedef uint8_t BrigMemoryOrder8_t; //.defValue=BRIG_MEMORY_ORDER_RELAXED + +typedef uint8_t BrigMemoryScope8_t; //.defValue=BRIG_MEMORY_SCOPE_SYSTEM + +typedef uint16_t BrigOpcode16_t; + +typedef uint32_t BrigOperandOffset32_t; //.defValue=0 //.wtype=ItemRef + +typedef uint8_t BrigPack8_t; //.defValue=BRIG_PACK_NONE + +typedef uint8_t BrigProfile8_t; //.defValue=BRIG_PROFILE_FULL + +typedef uint16_t BrigRegisterKind16_t; + +typedef uint8_t BrigRound8_t; //.defValue=BRIG_ROUND_NONE + +typedef uint8_t BrigSamplerAddressing8_t; //.defValue=BRIG_ADDRESSING_CLAMP_TO_EDGE + +typedef uint8_t BrigSamplerCoordNormalization8_t; + +typedef uint8_t BrigSamplerFilter8_t; + +typedef uint8_t BrigSamplerQuery8_t; + +typedef uint32_t BrigSectionIndex32_t; + +typedef uint8_t BrigSegCvtModifier8_t; + +typedef uint8_t BrigSegment8_t; //.defValue=BRIG_SEGMENT_NONE + +typedef uint32_t BrigStringOffset32_t; //.defValue=0 //.wtype=StrRef + +typedef uint16_t BrigType16_t; + +typedef uint8_t BrigVariableModifier8_t; + +typedef uint8_t BrigWidth8_t; + +typedef uint32_t BrigExceptions32_t; + +enum BrigKind { + + //.wname={ s/^BRIG_KIND//; MACRO2Name($_) } + //.mnemo=$wname{ $wname } + // + //.sizeof=$wname{ "sizeof(".$structs->{"Brig".$wname}->{rawbrig}.")" } + //.sizeof_switch //.sizeof_proto="int size_of_brig_record(unsigned arg)" //.sizeof_default="return -1" + // + //.isBodyOnly={ 
"false" } + //.isBodyOnly_switch //.isBodyOnly_proto="bool isBodyOnly(Directive d)" //.isBodyOnly_arg="d.kind()" + //.isBodyOnly_default="assert(false); return false" + // + //.isToplevelOnly={ "false" } + //.isToplevelOnly_switch //.isToplevelOnly_proto="bool isToplevelOnly(Directive d)" //.isToplevelOnly_arg="d.kind()" + //.isToplevelOnly_default="assert(false); return false" + + BRIG_KIND_NONE = 0x0000, //.skip + + BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, //.skip + BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_COMMENT = 0x1002, + BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005, + BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_LABEL = 0x1009, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_LOC = 0x100a, + BRIG_KIND_DIRECTIVE_MODULE = 0x100b, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c, + BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e, + BRIG_KIND_DIRECTIVE_END = 0x100f, //.skip + + BRIG_KIND_INST_BEGIN = 0x2000, //.skip + BRIG_KIND_INST_ADDR = 0x2000, + BRIG_KIND_INST_ATOMIC = 0x2001, + BRIG_KIND_INST_BASIC = 0x2002, + BRIG_KIND_INST_BR = 0x2003, + BRIG_KIND_INST_CMP = 0x2004, + BRIG_KIND_INST_CVT = 0x2005, + BRIG_KIND_INST_IMAGE = 0x2006, + BRIG_KIND_INST_LANE = 0x2007, + BRIG_KIND_INST_MEM = 0x2008, + BRIG_KIND_INST_MEM_FENCE = 0x2009, + BRIG_KIND_INST_MOD = 0x200a, + BRIG_KIND_INST_QUERY_IMAGE = 0x200b, + BRIG_KIND_INST_QUERY_SAMPLER = 0x200c, + BRIG_KIND_INST_QUEUE = 0x200d, + BRIG_KIND_INST_SEG = 0x200e, + BRIG_KIND_INST_SEG_CVT = 0x200f, + BRIG_KIND_INST_SIGNAL = 0x2010, + BRIG_KIND_INST_SOURCE_TYPE = 0x2011, + 
BRIG_KIND_INST_END = 0x2012, //.skip + + BRIG_KIND_OPERAND_BEGIN = 0x3000, //.skip + BRIG_KIND_OPERAND_ADDRESS = 0x3000, + BRIG_KIND_OPERAND_ALIGN = 0x3001, + BRIG_KIND_OPERAND_CODE_LIST = 0x3002, + BRIG_KIND_OPERAND_CODE_REF = 0x3003, + BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004, + BRIG_KIND_OPERAND_RESERVED = 0x3005, //.skip + BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006, + BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007, + BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008, + BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009, + BRIG_KIND_OPERAND_REGISTER = 0x300a, + BRIG_KIND_OPERAND_STRING = 0x300b, + BRIG_KIND_OPERAND_WAVESIZE = 0x300c, + BRIG_KIND_OPERAND_END = 0x300d //.skip +}; + +enum BrigAlignment { + + //.mnemo={ s/^BRIG_ALIGNMENT_//; lc } + //.mnemo_proto="const char* align2str(unsigned arg)" + // + //.bytes={ /(\d+)/ ? $1 : undef } + //.bytes_switch //.bytes_proto="unsigned align2num(unsigned arg)" //.bytes_default="assert(false); return -1" + // + //.rbytes=$bytes{ $bytes } + //.rbytes_switch //.rbytes_reverse //.rbytes_proto="BrigAlignment num2align(uint64_t arg)" + //.rbytes_default="return BRIG_ALIGNMENT_LAST" + // + //.print=$bytes{ $bytes>1 ? 
"_align($bytes)" : "" } + + BRIG_ALIGNMENT_NONE = 0, //.no_mnemo + BRIG_ALIGNMENT_1 = 1, //.mnemo="" + BRIG_ALIGNMENT_2 = 2, + BRIG_ALIGNMENT_4 = 3, + BRIG_ALIGNMENT_8 = 4, + BRIG_ALIGNMENT_16 = 5, + BRIG_ALIGNMENT_32 = 6, + BRIG_ALIGNMENT_64 = 7, + BRIG_ALIGNMENT_128 = 8, + BRIG_ALIGNMENT_256 = 9, + + BRIG_ALIGNMENT_LAST, //.skip + BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_LAST - 1 //.skip +}; + +enum BrigAllocation { + + //.mnemo={ s/^BRIG_ALLOCATION_//;lc } + //.mnemo_token=EAllocKind + + BRIG_ALLOCATION_NONE = 0, //.mnemo="" + BRIG_ALLOCATION_PROGRAM = 1, + BRIG_ALLOCATION_AGENT = 2, + BRIG_ALLOCATION_AUTOMATIC = 3 +}; + +enum BrigAluModifierMask { + BRIG_ALU_FTZ = 1 +}; + +enum BrigAtomicOperation { + + //.tdcaption="Atomic Operations" + // + //.mnemo={ s/^BRIG_ATOMIC_//;lc } + //.mnemo_token=_EMAtomicOp + //.mnemo_context=EInstModifierInstAtomicContext + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_ATOMIC_ADD = 0, + BRIG_ATOMIC_AND = 1, + BRIG_ATOMIC_CAS = 2, + BRIG_ATOMIC_EXCH = 3, + BRIG_ATOMIC_LD = 4, + BRIG_ATOMIC_MAX = 5, + BRIG_ATOMIC_MIN = 6, + BRIG_ATOMIC_OR = 7, + BRIG_ATOMIC_ST = 8, + BRIG_ATOMIC_SUB = 9, + BRIG_ATOMIC_WRAPDEC = 10, + BRIG_ATOMIC_WRAPINC = 11, + BRIG_ATOMIC_XOR = 12, + BRIG_ATOMIC_WAIT_EQ = 13, + BRIG_ATOMIC_WAIT_NE = 14, + BRIG_ATOMIC_WAIT_LT = 15, + BRIG_ATOMIC_WAIT_GTE = 16, + BRIG_ATOMIC_WAITTIMEOUT_EQ = 17, + BRIG_ATOMIC_WAITTIMEOUT_NE = 18, + BRIG_ATOMIC_WAITTIMEOUT_LT = 19, + BRIG_ATOMIC_WAITTIMEOUT_GTE = 20 +}; + +enum BrigCompareOperation { + + //.tdcaption="Comparison Operators" + // + //.mnemo={ s/^BRIG_COMPARE_//;lc } + //.mnemo_token=_EMCompare + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_COMPARE_EQ = 0, + BRIG_COMPARE_NE = 1, + BRIG_COMPARE_LT = 2, + BRIG_COMPARE_LE = 3, + BRIG_COMPARE_GT = 4, + BRIG_COMPARE_GE = 5, + BRIG_COMPARE_EQU = 6, + BRIG_COMPARE_NEU = 7, + BRIG_COMPARE_LTU = 8, + BRIG_COMPARE_LEU = 9, + BRIG_COMPARE_GTU = 10, + BRIG_COMPARE_GEU = 11, + BRIG_COMPARE_NUM = 12, + BRIG_COMPARE_NAN = 13, + BRIG_COMPARE_SEQ 
= 14, + BRIG_COMPARE_SNE = 15, + BRIG_COMPARE_SLT = 16, + BRIG_COMPARE_SLE = 17, + BRIG_COMPARE_SGT = 18, + BRIG_COMPARE_SGE = 19, + BRIG_COMPARE_SGEU = 20, + BRIG_COMPARE_SEQU = 21, + BRIG_COMPARE_SNEU = 22, + BRIG_COMPARE_SLTU = 23, + BRIG_COMPARE_SLEU = 24, + BRIG_COMPARE_SNUM = 25, + BRIG_COMPARE_SNAN = 26, + BRIG_COMPARE_SGTU = 27 +}; + +enum BrigControlDirective { + + //.mnemo={ s/^BRIG_CONTROL_//;lc } + //.mnemo_token=EControl + // + //.print=$mnemo{ $mnemo } + + BRIG_CONTROL_NONE = 0, //.skip + BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1, + BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2, + BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3, + BRIG_CONTROL_MAXFLATGRIDSIZE = 4, + BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5, + BRIG_CONTROL_REQUIREDDIM = 6, + BRIG_CONTROL_REQUIREDGRIDSIZE = 7, + BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8, + BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9 +}; + +enum BrigExecutableModifierMask { + //.nodump + BRIG_EXECUTABLE_DEFINITION = 1 +}; + +enum BrigImageChannelOrder { + + //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc } + //.mnemo_token=EImageOrder + //.mnemo_context=EImageOrderContext + // + //.print=$mnemo{ $mnemo } + + BRIG_CHANNEL_ORDER_A = 0, + BRIG_CHANNEL_ORDER_R = 1, + BRIG_CHANNEL_ORDER_RX = 2, + BRIG_CHANNEL_ORDER_RG = 3, + BRIG_CHANNEL_ORDER_RGX = 4, + BRIG_CHANNEL_ORDER_RA = 5, + BRIG_CHANNEL_ORDER_RGB = 6, + BRIG_CHANNEL_ORDER_RGBX = 7, + BRIG_CHANNEL_ORDER_RGBA = 8, + BRIG_CHANNEL_ORDER_BGRA = 9, + BRIG_CHANNEL_ORDER_ARGB = 10, + BRIG_CHANNEL_ORDER_ABGR = 11, + BRIG_CHANNEL_ORDER_SRGB = 12, + BRIG_CHANNEL_ORDER_SRGBX = 13, + BRIG_CHANNEL_ORDER_SRGBA = 14, + BRIG_CHANNEL_ORDER_SBGRA = 15, + BRIG_CHANNEL_ORDER_INTENSITY = 16, + BRIG_CHANNEL_ORDER_LUMINANCE = 17, + BRIG_CHANNEL_ORDER_DEPTH = 18, + BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19, + + // used internally + BRIG_CHANNEL_ORDER_UNKNOWN, //.mnemo="" // used when no order is specified + + BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 //.skip + +}; + +enum BrigImageChannelType { + + //.mnemo={ 
s/^BRIG_CHANNEL_TYPE_//;lc } + //.mnemo_token=EImageFormat + // + //.print=$mnemo{ $mnemo } + + BRIG_CHANNEL_TYPE_SNORM_INT8 = 0, + BRIG_CHANNEL_TYPE_SNORM_INT16 = 1, + BRIG_CHANNEL_TYPE_UNORM_INT8 = 2, + BRIG_CHANNEL_TYPE_UNORM_INT16 = 3, + BRIG_CHANNEL_TYPE_UNORM_INT24 = 4, + BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7, + BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8, + BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9, + BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10, + BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + BRIG_CHANNEL_TYPE_HALF_FLOAT = 14, + BRIG_CHANNEL_TYPE_FLOAT = 15, + + // used internally + BRIG_CHANNEL_TYPE_UNKNOWN, //.mnemo="" + + BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigImageGeometry { + + //.tdcaption="Geometry" + // + //.mnemo={ s/^BRIG_GEOMETRY_//;lc } + //.mnemo_token=EImageGeometry + // + //.dim={/_([0-9]+D)(A)?/ ? $1+(defined $2?1:0) : undef} + //.dim_switch //.dim_proto="unsigned getBrigGeometryDim(unsigned geo)" //.dim_arg="geo" + //.dim_default="assert(0); return 0" + // + //.depth={/DEPTH$/?"true":"false"} + //.depth_switch //.depth_proto="bool isBrigGeometryDepth(unsigned geo)" //.depth_arg="geo" + //.depth_default="return false" + + BRIG_GEOMETRY_1D = 0, + BRIG_GEOMETRY_2D = 1, + BRIG_GEOMETRY_3D = 2, + BRIG_GEOMETRY_1DA = 3, + BRIG_GEOMETRY_2DA = 4, + BRIG_GEOMETRY_1DB = 5, + BRIG_GEOMETRY_2DDEPTH = 6, + BRIG_GEOMETRY_2DADEPTH = 7, + + // used internally + BRIG_GEOMETRY_UNKNOWN, //.mnemo="" + + BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigImageQuery { + + //.mnemo={ s/^BRIG_IMAGE_QUERY_//;lc } + // + //.print=$mnemo{ $mnemo } + + BRIG_IMAGE_QUERY_WIDTH = 0, + BRIG_IMAGE_QUERY_HEIGHT = 1, + BRIG_IMAGE_QUERY_DEPTH = 2, + BRIG_IMAGE_QUERY_ARRAY = 3, + BRIG_IMAGE_QUERY_CHANNELORDER = 4, + BRIG_IMAGE_QUERY_CHANNELTYPE = 5, + + BRIG_IMAGE_QUERY_FIRST_USER_DEFINED = 6 //.skip +}; + 
+enum BrigLinkage { + + //.mnemo={ s/^BRIG_LINKAGE_//;s/NONE//;lc } + + BRIG_LINKAGE_NONE = 0, + BRIG_LINKAGE_PROGRAM = 1, + BRIG_LINKAGE_MODULE = 2, + BRIG_LINKAGE_FUNCTION = 3, + BRIG_LINKAGE_ARG = 4 +}; + +enum BrigMachineModel { + + //.mnemo={ s/^BRIG_MACHINE_//; '$'.lc } + //.mnemo_token=ETargetMachine + // + //.print=$mnemo{ $mnemo } + + BRIG_MACHINE_SMALL = 0, + BRIG_MACHINE_LARGE = 1, + + BRIG_MACHINE_UNDEF = 2 //.skip +}; + +enum BrigMemoryModifierMask { + BRIG_MEMORY_CONST = 1 +}; + +enum BrigMemoryOrder { + + //.mnemo={ s/^BRIG_MEMORY_ORDER_//; lc } + //.mnemo_token=_EMMemoryOrder + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_MEMORY_ORDER_NONE = 0, //.mnemo="" + BRIG_MEMORY_ORDER_RELAXED = 1, //.mnemo=rlx + BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, //.mnemo=scacq + BRIG_MEMORY_ORDER_SC_RELEASE = 3, //.mnemo=screl + BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, //.mnemo=scar + + BRIG_MEMORY_ORDER_LAST = 5 //.skip +}; + +enum BrigMemoryScope { + + //.mnemo={ s/^BRIG_MEMORY_SCOPE_//; lc } + //.mnemo_token=_EMMemoryScope + // + //.print=$mnemo{ $mnemo } + + BRIG_MEMORY_SCOPE_NONE = 0, //.mnemo="" + BRIG_MEMORY_SCOPE_WORKITEM = 1, //.mnemo="" + BRIG_MEMORY_SCOPE_WAVEFRONT = 2, //.mnemo=wave + BRIG_MEMORY_SCOPE_WORKGROUP = 3, //.mnemo=wg + BRIG_MEMORY_SCOPE_AGENT = 4, //.mnemo=agent + BRIG_MEMORY_SCOPE_SYSTEM = 5, //.mnemo=system + + BRIG_MEMORY_SCOPE_LAST = 6 //.skip +}; + +enum BrigOpcode { + + //.tdcaption="Instruction Opcodes" + // + //.k={ "BASIC" } + //.pscode=$k{ MACRO2Name("_".$k) } + //.opcodeparser=$pscode{ return $pscode && "parseMnemo$pscode" } + //.opcodeparser_incfile=ParserUtilities + //.opcodeparser_switch //.opcodeparser_proto="OpcodeParser getCoreOpcodeParser(BrigOpcode16_t arg)" //.opcodeparser_default="return parseMnemoBasic" + // + //.psopnd={undef} + //.opndparser=$psopnd{ return $psopnd && "&Parser::parse$psopnd" } + //.opndparser_incfile=ParserUtilities + //.opndparser_switch //.opndparser_proto="Parser::OperandParser 
Parser::getCoreOperandParser(BrigOpcode16_t arg)" //.opndparser_default="return &Parser::parseOperands" + // + //.mnemo={ s/^BRIG_OPCODE_//; lc } + //.mnemo_scanner=Instructions //.mnemo_token=EInstruction + //.mnemo_context=EDefaultContext + // + //.vecOpndIndex={undef} + //.vecOpndIndex_switch //.vecOpndIndex_proto="int getCoreVXIndex(BrigOpcode16_t arg)" //.vecOpndIndex_default="return -1" + // + //.numdst={undef} + //.numdst_switch //.numdst_proto="int getCoreDstOperandsNum(BrigOpcode16_t arg)" //.numdst_default="return 1" + // + //.print=$mnemo{ $mnemo } + + BRIG_OPCODE_NOP = 0, //.k=NOP + BRIG_OPCODE_ABS = 1, //.k=BASIC_OR_MOD + BRIG_OPCODE_ADD = 2, //.k=BASIC_OR_MOD + BRIG_OPCODE_BORROW = 3, + BRIG_OPCODE_CARRY = 4, + BRIG_OPCODE_CEIL = 5, //.k=BASIC_OR_MOD + BRIG_OPCODE_COPYSIGN = 6, //.k=BASIC_OR_MOD + BRIG_OPCODE_DIV = 7, //.k=BASIC_OR_MOD + BRIG_OPCODE_FLOOR = 8, //.k=BASIC_OR_MOD + BRIG_OPCODE_FMA = 9, //.k=BASIC_OR_MOD + BRIG_OPCODE_FRACT = 10, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAD = 11, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAX = 12, //.k=BASIC_OR_MOD + BRIG_OPCODE_MIN = 13, //.k=BASIC_OR_MOD + BRIG_OPCODE_MUL = 14, //.k=BASIC_OR_MOD + BRIG_OPCODE_MULHI = 15, //.k=BASIC_OR_MOD + BRIG_OPCODE_NEG = 16, //.k=BASIC_OR_MOD + BRIG_OPCODE_REM = 17, + BRIG_OPCODE_RINT = 18, //.k=BASIC_OR_MOD + BRIG_OPCODE_SQRT = 19, //.k=BASIC_OR_MOD + BRIG_OPCODE_SUB = 20, //.k=BASIC_OR_MOD + BRIG_OPCODE_TRUNC = 21, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAD24 = 22, + BRIG_OPCODE_MAD24HI = 23, + BRIG_OPCODE_MUL24 = 24, + BRIG_OPCODE_MUL24HI = 25, + BRIG_OPCODE_SHL = 26, + BRIG_OPCODE_SHR = 27, + BRIG_OPCODE_AND = 28, + BRIG_OPCODE_NOT = 29, + BRIG_OPCODE_OR = 30, + BRIG_OPCODE_POPCOUNT = 31, //.k=SOURCE_TYPE + BRIG_OPCODE_XOR = 32, + BRIG_OPCODE_BITEXTRACT = 33, + BRIG_OPCODE_BITINSERT = 34, + BRIG_OPCODE_BITMASK = 35, + BRIG_OPCODE_BITREV = 36, + BRIG_OPCODE_BITSELECT = 37, + BRIG_OPCODE_FIRSTBIT = 38, //.k=SOURCE_TYPE + BRIG_OPCODE_LASTBIT = 39, //.k=SOURCE_TYPE + BRIG_OPCODE_COMBINE = 
40, //.k=SOURCE_TYPE //.vecOpndIndex=1 + BRIG_OPCODE_EXPAND = 41, //.k=SOURCE_TYPE //.vecOpndIndex=0 + BRIG_OPCODE_LDA = 42, //.k=ADDR + BRIG_OPCODE_MOV = 43, + BRIG_OPCODE_SHUFFLE = 44, + BRIG_OPCODE_UNPACKHI = 45, + BRIG_OPCODE_UNPACKLO = 46, + BRIG_OPCODE_PACK = 47, //.k=SOURCE_TYPE + BRIG_OPCODE_UNPACK = 48, //.k=SOURCE_TYPE + BRIG_OPCODE_CMOV = 49, + BRIG_OPCODE_CLASS = 50, //.k=SOURCE_TYPE + BRIG_OPCODE_NCOS = 51, + BRIG_OPCODE_NEXP2 = 52, + BRIG_OPCODE_NFMA = 53, + BRIG_OPCODE_NLOG2 = 54, + BRIG_OPCODE_NRCP = 55, + BRIG_OPCODE_NRSQRT = 56, + BRIG_OPCODE_NSIN = 57, + BRIG_OPCODE_NSQRT = 58, + BRIG_OPCODE_BITALIGN = 59, + BRIG_OPCODE_BYTEALIGN = 60, + BRIG_OPCODE_PACKCVT = 61, //.k=SOURCE_TYPE + BRIG_OPCODE_UNPACKCVT = 62, //.k=SOURCE_TYPE + BRIG_OPCODE_LERP = 63, + BRIG_OPCODE_SAD = 64, //.k=SOURCE_TYPE + BRIG_OPCODE_SADHI = 65, //.k=SOURCE_TYPE + BRIG_OPCODE_SEGMENTP = 66, //.k=SEG_CVT + BRIG_OPCODE_FTOS = 67, //.k=SEG_CVT + BRIG_OPCODE_STOF = 68, //.k=SEG_CVT + BRIG_OPCODE_CMP = 69, //.k=CMP + BRIG_OPCODE_CVT = 70, //.k=CVT + BRIG_OPCODE_LD = 71, //.k=MEM //.vecOpndIndex=0 + BRIG_OPCODE_ST = 72, //.k=MEM //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_ATOMIC = 73, //.k=ATOMIC + BRIG_OPCODE_ATOMICNORET = 74, //.k=ATOMIC //.numdst=0 + BRIG_OPCODE_SIGNAL = 75, //.k=SIGNAL + BRIG_OPCODE_SIGNALNORET = 76, //.k=SIGNAL //.numdst=0 + BRIG_OPCODE_MEMFENCE = 77, //.k=MEM_FENCE //.numdst=0 + BRIG_OPCODE_RDIMAGE = 78, //.skip // NB: handled by IMAGE extension + BRIG_OPCODE_LDIMAGE = 79, //.skip // NB: handled by IMAGE extension + BRIG_OPCODE_STIMAGE = 80, //.skip // NB: handled by IMAGE extension + BRIG_OPCODE_IMAGEFENCE = 81, //.skip // NB: handled by IMAGE extension + BRIG_OPCODE_QUERYIMAGE = 82, //.skip // NB: handled by IMAGE extension + BRIG_OPCODE_QUERYSAMPLER = 83, //.skip // NB: handled by IMAGE extension + BRIG_OPCODE_CBR = 84, //.k=BR //.numdst=0 + BRIG_OPCODE_BR = 85, //.k=BR //.numdst=0 + BRIG_OPCODE_SBR = 86, //.k=BR //.numdst=0 //.psopnd=SbrOperands + 
BRIG_OPCODE_BARRIER = 87, //.k=BR //.numdst=0 + BRIG_OPCODE_WAVEBARRIER = 88, //.k=BR //.numdst=0 + BRIG_OPCODE_ARRIVEFBAR = 89, //.k=BR //.numdst=0 + BRIG_OPCODE_INITFBAR = 90, //.k=BASIC_NO_TYPE //.numdst=0 + BRIG_OPCODE_JOINFBAR = 91, //.k=BR //.numdst=0 + BRIG_OPCODE_LEAVEFBAR = 92, //.k=BR //.numdst=0 + BRIG_OPCODE_RELEASEFBAR = 93, //.k=BASIC_NO_TYPE //.numdst=0 + BRIG_OPCODE_WAITFBAR = 94, //.k=BR //.numdst=0 + BRIG_OPCODE_LDF = 95, + BRIG_OPCODE_ACTIVELANECOUNT = 96, //.k=LANE + BRIG_OPCODE_ACTIVELANEID = 97, //.k=LANE + BRIG_OPCODE_ACTIVELANEMASK = 98, //.k=LANE //.vecOpndIndex=0 + BRIG_OPCODE_ACTIVELANEPERMUTE = 99, //.k=LANE + BRIG_OPCODE_CALL = 100, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_SCALL = 101, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_ICALL = 102, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_RET = 103, //.k=BASIC_NO_TYPE + BRIG_OPCODE_ALLOCA = 104, //.k=MEM + BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105, + BRIG_OPCODE_CURRENTWORKITEMFLATID = 106, + BRIG_OPCODE_DIM = 107, + BRIG_OPCODE_GRIDGROUPS = 108, + BRIG_OPCODE_GRIDSIZE = 109, + BRIG_OPCODE_PACKETCOMPLETIONSIG = 110, + BRIG_OPCODE_PACKETID = 111, + BRIG_OPCODE_WORKGROUPID = 112, + BRIG_OPCODE_WORKGROUPSIZE = 113, + BRIG_OPCODE_WORKITEMABSID = 114, + BRIG_OPCODE_WORKITEMFLATABSID = 115, + BRIG_OPCODE_WORKITEMFLATID = 116, + BRIG_OPCODE_WORKITEMID = 117, + BRIG_OPCODE_CLEARDETECTEXCEPT = 118, //.numdst=0 + BRIG_OPCODE_GETDETECTEXCEPT = 119, + BRIG_OPCODE_SETDETECTEXCEPT = 120, //.numdst=0 + BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, //.k=QUEUE + BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, //.k=QUEUE + BRIG_OPCODE_LDQUEUEREADINDEX = 123, //.k=QUEUE + BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, //.k=QUEUE + BRIG_OPCODE_STQUEUEREADINDEX = 125, //.k=QUEUE //.numdst=0 + BRIG_OPCODE_STQUEUEWRITEINDEX = 126, //.k=QUEUE //.numdst=0 + BRIG_OPCODE_CLOCK = 127, + BRIG_OPCODE_CUID = 128, + BRIG_OPCODE_DEBUGTRAP = 129, //.numdst=0 + BRIG_OPCODE_GROUPBASEPTR = 130, + 
BRIG_OPCODE_KERNARGBASEPTR = 131, + BRIG_OPCODE_LANEID = 132, + BRIG_OPCODE_MAXCUID = 133, + BRIG_OPCODE_MAXWAVEID = 134, + BRIG_OPCODE_NULLPTR = 135, //.k=SEG + BRIG_OPCODE_WAVEID = 136, + BRIG_OPCODE_FIRST_USER_DEFINED = 32768, //.skip +}; + +enum BrigPack { + + //.tdcaption="Packing" + // + //.mnemo={ s/^BRIG_PACK_//;s/SAT$/_sat/;lc } + //.mnemo_token=_EMPacking + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_PACK_NONE = 0, //.mnemo="" + BRIG_PACK_PP = 1, + BRIG_PACK_PS = 2, + BRIG_PACK_SP = 3, + BRIG_PACK_SS = 4, + BRIG_PACK_S = 5, + BRIG_PACK_P = 6, + BRIG_PACK_PPSAT = 7, + BRIG_PACK_PSSAT = 8, + BRIG_PACK_SPSAT = 9, + BRIG_PACK_SSSAT = 10, + BRIG_PACK_SSAT = 11, + BRIG_PACK_PSAT = 12 +}; + +enum BrigProfile { + + //.mnemo={ s/^BRIG_PROFILE_//;'$'.lc } + //.mnemo_token=ETargetProfile + // + //.print=$mnemo{ $mnemo } + + BRIG_PROFILE_BASE = 0, + BRIG_PROFILE_FULL = 1, + + BRIG_PROFILE_UNDEF = 2 //.skip +}; + +enum BrigRegisterKind { + + //.mnemo={ s/^BRIG_REGISTER_KIND_//;'$'.lc(substr($_,0,1)) } + // + //.bits={ } + //.bits_switch //.bits_proto="unsigned getRegBits(BrigRegisterKind16_t arg)" //.bits_default="return (unsigned)-1" + // + //.nollvm + + BRIG_REGISTER_KIND_CONTROL = 0, //.bits=1 + BRIG_REGISTER_KIND_SINGLE = 1, //.bits=32 + BRIG_REGISTER_KIND_DOUBLE = 2, //.bits=64 + BRIG_REGISTER_KIND_QUAD = 3 //.bits=128 +}; + +enum BrigRound { + + //.mnemo={} + //.mnemo_fn=round2str //.mnemo_token=_EMRound + // + //.sat={/_SAT$/? "true" : "false"} + //.sat_switch //.sat_proto="bool isSatRounding(unsigned rounding)" //.sat_arg="rounding" + //.sat_default="return false" + // + //.sig={/_SIGNALING_/? "true" : "false"} + //.sig_switch //.sig_proto="bool isSignalingRounding(unsigned rounding)" //.sig_arg="rounding" + //.sig_default="return false" + // + //.int={/_INTEGER_/? "true" : "false"} + //.int_switch //.int_proto="bool isIntRounding(unsigned rounding)" //.int_arg="rounding" + //.int_default="return false" + // + //.flt={/_FLOAT_/? 
"true" : "false"} + //.flt_switch //.flt_proto="bool isFloatRounding(unsigned rounding)" //.flt_arg="rounding" + //.flt_default="return false" + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_ROUND_NONE = 0, //.no_mnemo + BRIG_ROUND_FLOAT_DEFAULT = 1, //.no_mnemo + BRIG_ROUND_FLOAT_NEAR_EVEN = 2, //.mnemo=near + BRIG_ROUND_FLOAT_ZERO = 3, //.mnemo=zero + BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, //.mnemo=up + BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, //.mnemo=down + BRIG_ROUND_INTEGER_NEAR_EVEN = 6, //.mnemo=neari + BRIG_ROUND_INTEGER_ZERO = 7, //.mnemo=zeroi + BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, //.mnemo=upi + BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, //.mnemo=downi + BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, //.mnemo=neari_sat + BRIG_ROUND_INTEGER_ZERO_SAT = 11, //.mnemo=zeroi_sat + BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, //.mnemo=upi_sat + BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, //.mnemo=downi_sat + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, //.mnemo=sneari + BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, //.mnemo=szeroi + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, //.mnemo=supi + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, //.mnemo=sdowni + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, //.mnemo=sneari_sat + BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, //.mnemo=szeroi_sat + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, //.mnemo=supi_sat + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 //.mnemo=sdowni_sat +}; + +enum BrigSamplerAddressing { + + //.mnemo={ s/^BRIG_ADDRESSING_//;lc } + //.mnemo_token=ESamplerAddressingMode + + BRIG_ADDRESSING_UNDEFINED = 0, + BRIG_ADDRESSING_CLAMP_TO_EDGE = 1, + BRIG_ADDRESSING_CLAMP_TO_BORDER = 2, + BRIG_ADDRESSING_REPEAT = 3, + BRIG_ADDRESSING_MIRRORED_REPEAT = 4, + + BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigSamplerCoordNormalization { + + //.mnemo={ s/^BRIG_COORD_//;lc } + //.mnemo_token=ESamplerCoord + // + //.print=$mnemo{ $mnemo } + + BRIG_COORD_UNNORMALIZED = 0, + BRIG_COORD_NORMALIZED = 1 
+}; + +enum BrigSamplerFilter { + + //.mnemo={ s/^BRIG_FILTER_//;lc } + // + //.print=$mnemo{ $mnemo } + + BRIG_FILTER_NEAREST = 0, + BRIG_FILTER_LINEAR = 1, + + BRIG_FILTER_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigSamplerQuery { + + //.mnemo={ s/^BRIG_SAMPLER_QUERY_//;lc } + //.mnemo_token=_EMSamplerQuery + // + //.print=$mnemo{ $mnemo } + + BRIG_SAMPLER_QUERY_ADDRESSING = 0, + BRIG_SAMPLER_QUERY_COORD = 1, + BRIG_SAMPLER_QUERY_FILTER = 2 +}; + +enum BrigSectionIndex { + + //.nollvm + // + //.mnemo={ s/^BRIG_SECTION_INDEX_/HSA_/;lc } + + BRIG_SECTION_INDEX_DATA = 0, + BRIG_SECTION_INDEX_CODE = 1, + BRIG_SECTION_INDEX_OPERAND = 2, + BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, + + // used internally + BRIG_SECTION_INDEX_IMPLEMENTATION_DEFINED = BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED //.skip +}; + +enum BrigSegCvtModifierMask { + BRIG_SEG_CVT_NONULL = 1 //.mnemo="nonull" //.print="_nonull" +}; + +enum BrigSegment { + + //.mnemo={ s/^BRIG_SEGMENT_//;lc} + //.mnemo_token=_EMSegment + //.mnemo_context=EInstModifierContext + // + //.print=$mnemo{ $mnemo ? 
"_$mnemo" : "" } + + BRIG_SEGMENT_NONE = 0, //.mnemo="" + BRIG_SEGMENT_FLAT = 1, //.mnemo="" + BRIG_SEGMENT_GLOBAL = 2, + BRIG_SEGMENT_READONLY = 3, + BRIG_SEGMENT_KERNARG = 4, + BRIG_SEGMENT_GROUP = 5, + BRIG_SEGMENT_PRIVATE = 6, + BRIG_SEGMENT_SPILL = 7, + BRIG_SEGMENT_ARG = 8, + + BRIG_SEGMENT_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigPackedTypeBits { + + //.nodump + // + //.nollvm + + BRIG_TYPE_BASE_SIZE = 5, + BRIG_TYPE_PACK_SIZE = 2, + BRIG_TYPE_ARRAY_SIZE = 1, + + BRIG_TYPE_BASE_SHIFT = 0, + BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, + BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, + + BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, + BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, + + BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, + + BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT +}; + +enum BrigType { + + //.numBits={ /ARRAY$/ ? undef : /([0-9]+)X([0-9]+)/ ? $1*$2 : /([0-9]+)/ ? $1 : undef } + //.numBits_switch //.numBits_proto="unsigned getBrigTypeNumBits(unsigned arg)" //.numBits_default="assert(0); return 0" + //.numBytes=$numBits{ $numBits > 1 ? $numBits/8 : undef } + //.numBytes_switch //.numBytes_proto="unsigned getBrigTypeNumBytes(unsigned arg)" //.numBytes_default="assert(0); return 0" + // + //.mnemo={ s/^BRIG_TYPE_//;lc } + //.mnemo_token=_EMType + // + //.array={/ARRAY$/?"true":"false"} + //.array_switch //.array_proto="bool isArrayType(unsigned type)" //.array_arg="type" + //.array_default="return false" + // + //.a2e={/(.*)_ARRAY$/? 
$1 : "BRIG_TYPE_NONE"} + //.a2e_switch //.a2e_proto="unsigned arrayType2elementType(unsigned type)" //.a2e_arg="type" + //.a2e_default="return BRIG_TYPE_NONE" + // + //.e2a={/_ARRAY$/? "BRIG_TYPE_NONE" : /_NONE$/ ? "BRIG_TYPE_NONE" : /_B1$/ ? "BRIG_TYPE_NONE" : $_ . "_ARRAY"} + //.e2a_switch //.e2a_proto="unsigned elementType2arrayType(unsigned type)" //.e2a_arg="type" + //.e2a_default="return BRIG_TYPE_NONE" + // + //.t2s={s/^BRIG_TYPE_//;lc s/_ARRAY$/[]/;lc} + //.t2s_switch //.t2s_proto="const char* type2name(unsigned type)" //.t2s_arg="type" + //.t2s_default="return NULL" + // + //.dispatch_switch //.dispatch_incfile=TemplateUtilities + //.dispatch_proto="template\nRetType dispatchByType_gen(unsigned type, Visitor& v)" + //.dispatch={ /ARRAY$/ ? "v.visitNone(type)" : /^BRIG_TYPE_([BUSF]|SIG)[0-9]+/ ? "v.template visit< BrigTypeTraits<$_> >()" : "v.visitNone(type)" } + //.dispatch_arg="type" //.dispatch_default="return v.visitNone(type)" + // + //- .tdname=BrigType + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_TYPE_NONE = 0, //.mnemo="" //.print="" + BRIG_TYPE_U8 = 1, //.ctype=uint8_t + BRIG_TYPE_U16 = 2, //.ctype=uint16_t + BRIG_TYPE_U32 = 3, //.ctype=uint32_t + BRIG_TYPE_U64 = 4, //.ctype=uint64_t + BRIG_TYPE_S8 = 5, //.ctype=int8_t + BRIG_TYPE_S16 = 6, //.ctype=int16_t + BRIG_TYPE_S32 = 7, //.ctype=int32_t + BRIG_TYPE_S64 = 8, //.ctype=int64_t + BRIG_TYPE_F16 = 9, //.ctype=f16_t + BRIG_TYPE_F32 = 10, //.ctype=float + BRIG_TYPE_F64 = 11, //.ctype=double + BRIG_TYPE_B1 = 12, //.ctype=bool //.numBytes=1 + BRIG_TYPE_B8 = 13, //.ctype=uint8_t + BRIG_TYPE_B16 = 14, //.ctype=uint16_t + BRIG_TYPE_B32 = 15, //.ctype=uint32_t + BRIG_TYPE_B64 = 16, //.ctype=uint64_t + BRIG_TYPE_B128 = 17, //.ctype=b128_t + BRIG_TYPE_SAMP = 18, //.mnemo=samp //.numBits=64 + BRIG_TYPE_ROIMG = 19, //.mnemo=roimg //.numBits=64 + BRIG_TYPE_WOIMG = 20, //.mnemo=woimg //.numBits=64 + BRIG_TYPE_RWIMG = 21, //.mnemo=rwimg //.numBits=64 + BRIG_TYPE_SIG32 = 22, //.mnemo=sig32 //.numBits=64 + 
BRIG_TYPE_SIG64 = 23, //.mnemo=sig64 //.numBits=64 + + BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, //.ctype=uint8_t + BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, //.ctype=uint8_t + BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, //.ctype=uint8_t + BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, //.ctype=uint16_t + BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, //.ctype=uint16_t + BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, //.ctype=uint16_t + BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, //.ctype=uint32_t + BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, //.ctype=uint32_t + BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, //.ctype=uint64_t + BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, //.ctype=int8_t + BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, //.ctype=int8_t + BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, //.ctype=int8_t + BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, //.ctype=int16_t + BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64, //.ctype=int16_t + BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, //.ctype=int16_t + BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, //.ctype=int32_t + BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, //.ctype=int32_t + BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, //.ctype=int64_t + BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, //.ctype=f16_t + BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, //.ctype=f16_t + BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, //.ctype=f16_t + BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, //.ctype=float + BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, //.ctype=float + BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, //.ctype=double + + BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, 
//.mnemo="" //.print="" + BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, //.mnemo="" 
//.print="" + BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + + // Used internally + BRIG_TYPE_INVALID = (unsigned) -1 //.skip +}; + +enum BrigVariableModifierMask { + + //.nodump + + BRIG_VARIABLE_DEFINITION = 1, + BRIG_VARIABLE_CONST = 2 +}; + +enum BrigWidth { + + //.print={ s/^BRIG_WIDTH_//; "_width($_)" } + + BRIG_WIDTH_NONE = 0, + BRIG_WIDTH_1 = 1, + BRIG_WIDTH_2 = 2, + BRIG_WIDTH_4 = 3, + BRIG_WIDTH_8 = 4, + BRIG_WIDTH_16 = 
5, + BRIG_WIDTH_32 = 6, + BRIG_WIDTH_64 = 7, + BRIG_WIDTH_128 = 8, + BRIG_WIDTH_256 = 9, + BRIG_WIDTH_512 = 10, + BRIG_WIDTH_1024 = 11, + BRIG_WIDTH_2048 = 12, + BRIG_WIDTH_4096 = 13, + BRIG_WIDTH_8192 = 14, + BRIG_WIDTH_16384 = 15, + BRIG_WIDTH_32768 = 16, + BRIG_WIDTH_65536 = 17, + BRIG_WIDTH_131072 = 18, + BRIG_WIDTH_262144 = 19, + BRIG_WIDTH_524288 = 20, + BRIG_WIDTH_1048576 = 21, + BRIG_WIDTH_2097152 = 22, + BRIG_WIDTH_4194304 = 23, + BRIG_WIDTH_8388608 = 24, + BRIG_WIDTH_16777216 = 25, + BRIG_WIDTH_33554432 = 26, + BRIG_WIDTH_67108864 = 27, + BRIG_WIDTH_134217728 = 28, + BRIG_WIDTH_268435456 = 29, + BRIG_WIDTH_536870912 = 30, + BRIG_WIDTH_1073741824 = 31, + BRIG_WIDTH_2147483648 = 32, + BRIG_WIDTH_WAVESIZE = 33, + BRIG_WIDTH_ALL = 34, + + BRIG_WIDTH_LAST //.skip +}; + +struct BrigUInt64 { //.isroot //.standalone + uint32_t lo; //.defValue=0 + uint32_t hi; //.defValue=0 + + //+hcode KLASS& operator=(uint64_t rhs); + //+hcode operator uint64_t(); + //+implcode inline KLASS& KLASS::operator=(uint64_t rhs) { lo() = (uint32_t)rhs; hi() = (uint32_t)(rhs >> 32); return *this; } + //+implcode inline KLASS::operator uint64_t() { return ((uint64_t)hi()) << 32 | lo(); } +}; + +struct BrigAluModifier { //.isroot //.standalone + BrigAluModifier8_t allBits; //.defValue=0 + //^^ bool ftz; //.wtype=BitValRef<0> +}; + +struct BrigBase { //.nowrap + uint16_t byteCount; + BrigKind16_t kind; +}; + +//.alias Code:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_CODE }; +//.alias Directive:Code { //.generic }; +//.alias Operand:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_OPERAND }; + +struct BrigData { + //.nowrap + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct BrigExecutableModifier { //.isroot //.standalone + BrigExecutableModifier8_t allBits; //.defValue=0 + //^^ bool isDefinition; //.wtype=BitValRef<0> +}; + +struct BrigMemoryModifier { //.isroot //.standalone + BrigMemoryModifier8_t allBits; //.defValue=0 + //^^ bool isConst; 
//.wtype=BitValRef<0> +}; + +struct BrigSegCvtModifier { //.isroot //.standalone + BrigSegCvtModifier8_t allBits; //.defValue=0 + //^^ bool isNoNull; //.wtype=BitValRef<0> +}; + +struct BrigVariableModifier { //.isroot //.standalone + BrigVariableModifier8_t allBits; //.defValue=0 + + //^^ bool isDefinition; //.wtype=BitValRef<0> + //^^ bool isConst; //.wtype=BitValRef<1> +}; + +struct BrigDirectiveArgBlockEnd { + BrigBase base; +}; + +struct BrigDirectiveArgBlockStart { + BrigBase base; +}; + +struct BrigDirectiveComment { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveControl { + BrigBase base; + BrigControlDirective16_t control; + uint16_t reserved; //.defValue=0 + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveExecutable { //.generic + BrigBase base; + BrigDataOffsetString32_t name; + uint16_t outArgCount; //.defValue=0 + uint16_t inArgCount; //.defValue=0 + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + BrigExecutableModifier modifier; //.acc=subItem //.wtype=ExecutableModifier + BrigLinkage8_t linkage; + uint16_t reserved; //.defValue=0 +}; + +//.alias DirectiveKernel:DirectiveExecutable { }; +//.alias DirectiveFunction:DirectiveExecutable { }; +//.alias DirectiveSignature:DirectiveExecutable { }; +//.alias DirectiveIndirectFunction:DirectiveExecutable { }; + +struct BrigDirectiveExtension { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveFbarrier { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVariableModifier modifier; //.acc=subItem //.wtype=VariableModifier + BrigLinkage8_t linkage; + uint16_t reserved; //.defValue=0 +}; + +struct BrigDirectiveLabel { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveLoc { + BrigBase base; + BrigDataOffsetString32_t filename; + uint32_t line; + uint32_t column; //.defValue=1 +}; + +struct BrigDirectiveNone { //.enum=BRIG_KIND_NONE + BrigBase 
base; +}; + +struct BrigDirectivePragma { + BrigBase base; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveVariable { + BrigBase base; + BrigDataOffsetString32_t name; + BrigOperandOffset32_t init; + BrigType16_t type; + + //+hcode bool isArray(); + //+implcode inline bool KLASS::isArray() { return isArrayType(type()); } + + //+hcode unsigned elementType(); + //+implcode inline unsigned KLASS::elementType() { return isArray()? arrayType2elementType(type()) : type(); } + + BrigSegment8_t segment; + BrigAlignment8_t align; + BrigUInt64 dim; //.acc=subItem //.wtype=UInt64 + BrigVariableModifier modifier; //.acc=subItem //.wtype=VariableModifier + BrigLinkage8_t linkage; + BrigAllocation8_t allocation; + uint8_t reserved; //.defValue=0 +}; + +struct BrigDirectiveModule { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVersion32_t hsailMajor; //.wtype=ValRef + BrigVersion32_t hsailMinor; //.wtype=ValRef + BrigProfile8_t profile; + BrigMachineModel8_t machineModel; + BrigRound8_t defaultFloatRound; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstBase { //.wname=Inst //.generic //.parent=BrigCode + BrigBase base; + BrigOpcode16_t opcode; + BrigType16_t type; + BrigDataOffsetOperandList32_t operands; + + //+hcode Operand operand(int index); + //+implcode inline Operand KLASS::operand(int index) { return operands()[index]; } +}; + +struct BrigInstAddr { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstAtomic { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t memoryScope; + BrigAtomicOperation8_t atomicOperation; + uint8_t equivClass; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstBasic { + BrigInstBase base; +}; + +struct BrigInstBr { + BrigInstBase base; + BrigWidth8_t width; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstCmp { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier modifier; 
//.acc=subItem //.wtype=AluModifier + BrigCompareOperation8_t compare; + BrigPack8_t pack; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier modifier; //.acc=subItem //.wtype=AluModifier + BrigRound8_t round; +}; + +struct BrigInstImage { + BrigInstBase base; + BrigType16_t imageType; + BrigType16_t coordType; + BrigImageGeometry8_t geometry; + uint8_t equivClass; + uint16_t reserved; //.defValue=0 +}; + +struct BrigInstLane { + BrigInstBase base; + BrigType16_t sourceType; + BrigWidth8_t width; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstMem { + BrigInstBase base; + BrigSegment8_t segment; + BrigAlignment8_t align; + uint8_t equivClass; + BrigWidth8_t width; + BrigMemoryModifier modifier; //.acc=subItem //.wtype=MemoryModifier + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstMemFence { + BrigInstBase base; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t globalSegmentMemoryScope; + BrigMemoryScope8_t groupSegmentMemoryScope; + BrigMemoryScope8_t imageSegmentMemoryScope; +}; + +struct BrigInstMod { + BrigInstBase base; + BrigAluModifier modifier; //.acc=subItem //.wtype=AluModifier + BrigRound8_t round; + BrigPack8_t pack; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstQueryImage { + BrigInstBase base; + BrigType16_t imageType; + BrigImageGeometry8_t geometry; + BrigImageQuery8_t imageQuery; +}; + +struct BrigInstQuerySampler { + BrigInstBase base; + BrigSamplerQuery8_t samplerQuery; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstQueue { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + uint16_t reserved; //.defValue=0 +}; + +struct BrigInstSeg { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstSegCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigSegment8_t segment; + BrigSegCvtModifier modifier; //.acc=subItem //.wtype=SegCvtModifier +}; 
+ +struct BrigInstSignal { + BrigInstBase base; + BrigType16_t signalType; + BrigMemoryOrder8_t memoryOrder; + BrigAtomicOperation8_t signalOperation; +}; + +struct BrigInstSourceType { + BrigInstBase base; + BrigType16_t sourceType; + uint16_t reserved; //.defValue=0 +}; + +struct BrigOperandAddress { + BrigBase base; + BrigCodeOffset32_t symbol; //.wtype=ItemRef + BrigOperandOffset32_t reg; //.wtype=ItemRef + BrigUInt64 offset; //.acc=subItem //.wtype=UInt64 +}; + +struct BrigOperandAlign { + BrigBase base; + BrigAlignment8_t align; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigOperandCodeList { + BrigBase base; + BrigDataOffsetCodeList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Code elements(int index); + //+implcode inline Code KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandCodeRef { + BrigBase base; + BrigCodeOffset32_t ref; +}; + +struct BrigOperandConstantBytes { + BrigBase base; + BrigType16_t type; //.defValue=0 + uint16_t reserved; //.defValue=0 + BrigDataOffsetString32_t bytes; +}; + +struct BrigOperandConstantOperandList { + BrigBase base; + BrigType16_t type; + uint16_t reserved; //.defValue=0 + BrigDataOffsetOperandList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Operand elements(int index); + //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandConstantImage { + BrigBase base; + BrigType16_t type; + BrigImageGeometry8_t geometry; + BrigImageChannelOrder8_t channelOrder; + BrigImageChannelType8_t channelType; + uint8_t reserved[3]; //.defValue=0 + BrigUInt64 width; //.acc=subItem //.wtype=UInt64 + BrigUInt64 height; //.acc=subItem //.wtype=UInt64 + BrigUInt64 depth; //.acc=subItem //.wtype=UInt64 + BrigUInt64 array; //.acc=subItem //.wtype=UInt64 +}; + 
+struct BrigOperandOperandList { + BrigBase base; + BrigDataOffsetOperandList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Operand elements(int index); + //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandRegister { + BrigBase base; + BrigRegisterKind16_t regKind; + uint16_t regNum; +}; + +struct BrigOperandConstantSampler { + BrigBase base; + BrigType16_t type; + BrigSamplerCoordNormalization8_t coord; + BrigSamplerFilter8_t filter; + BrigSamplerAddressing8_t addressing; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigOperandString { + BrigBase base; + BrigDataOffsetString32_t string; +}; + +struct BrigOperandWavesize { + BrigBase base; +}; + +//.ignore{ + +enum BrigExceptionsMask { + BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0, + BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1, + BRIG_EXCEPTIONS_OVERFLOW = 1 << 2, + BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3, + BRIG_EXCEPTIONS_INEXACT = 1 << 4, + + BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 +}; + +struct BrigSectionHeader { + uint64_t byteCount; + uint32_t headerByteCount; + uint32_t nameLength; + uint8_t name[1]; +}; + +#define MODULE_IDENTIFICATION_LENGTH (8) + +struct BrigModuleHeader { + char identification[MODULE_IDENTIFICATION_LENGTH]; + BrigVersion32_t brigMajor; + BrigVersion32_t brigMinor; + uint64_t byteCount; + uint8_t hash[64]; + uint32_t reserved; + uint32_t sectionCount; + uint64_t sectionIndex; +}; + +typedef BrigModuleHeader* BrigModule_t; + +#endif // defined(INCLUDED_BRIG_H) +//} diff --git a/runtime/hsa-runtime/inc/amd_hsa_common.h b/runtime/hsa-runtime/inc/amd_hsa_common.h new file mode 100644 index 0000000000..92aba8ed2e --- /dev/null +++ b/runtime/hsa-runtime/inc/amd_hsa_common.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License 
(NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
//
////////////////////////////////////////////////////////////////////////////////

// The following set of header files provides definitions for AMD GPU
// Architecture:
//   - amd_hsa_common.h
//   - amd_hsa_elf.h
//   - amd_hsa_kernel_code.h
//   - amd_hsa_queue.h
//   - amd_hsa_signal.h
//
// Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more
// information.

#ifndef AMD_HSA_COMMON_H
#define AMD_HSA_COMMON_H

// NOTE(review): the header names were missing from the two #include lines in
// the text this was restored from; <stddef.h>/<stdint.h> are what the
// fixed-width types used by the sibling headers require - confirm.
#include <stddef.h>
#include <stdint.h>

// Descriptive version of the HSA Application Binary Interface.
#define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)"

// Alignment attribute that specifies a minimum alignment (in bytes) for
// variables of the specified type.
#if defined(__GNUC__)
#  define __ALIGNED__(x) __attribute__((aligned(x)))
#elif defined(_MSC_VER)
#  define __ALIGNED__(x) __declspec(align(x))
#elif defined(RC_INVOKED)
#  define __ALIGNED__(x)
#else
#  error "amd_hsa_common.h: no __ALIGNED__ definition for this compiler"
#endif

// Creates enumeration entries for packed types. For a field `name` this
// emits three enumerators:
//   name_SHIFT - position of the field's least significant bit,
//   name_WIDTH - number of bits in the field,
//   name       - mask with the field's bits set.
// `width` must be in the range 1..31 and `shift + width` must not exceed 32.
// The mask is computed in unsigned arithmetic: the previous signed form
// overflowed `int` for any field that reaches bit 31.
// NOTE(review): masks above INT_MAX are outside the enumerator range of
// strict C11; confirm every target compiler accepts them.
#define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width)                   \
  name ## _SHIFT = (shift),                                                    \
  name ## _WIDTH = (width),                                                    \
  name = (((1u << (width)) - 1u) << (shift))

// Gets bits for specified mask from specified src packed instance.
// `mask` must be a bare field name created by
// AMD_HSA_BITS_CREATE_ENUM_ENTRIES, because it is pasted with _SHIFT.
// `src` is parenthesised so that an expression argument such as `a | b` is
// masked as a whole.
#define AMD_HSA_BITS_GET(src, mask)                                            \
  (((src) & (mask)) >> mask ## _SHIFT)

// Sets val bits for specified mask in specified dst packed instance.
// Expands to a single statement, so it is safe as the body of an unbraced
// if/else; terminate the invocation with a semicolon as before. Bits of
// `val` that do not fit in the field are discarded. `val` is widened to
// uint32_t before shifting so that a field ending at bit 31 does not
// overflow a signed int.
#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
  do {                                                                         \
    (dst) &= ~(mask);                                                          \
    (dst) |= (((uint32_t)(val)) << mask ## _SHIFT) & (mask);                   \
  } while (0)

#endif // AMD_HSA_COMMON_H

// Patch metadata for the next file, kept verbatim but commented out:
// diff --git a/runtime/hsa-runtime/inc/amd_hsa_elf.h b/runtime/hsa-runtime/inc/amd_hsa_elf.h
// new file mode 100644
// index 0000000000..941aeeb389
// --- /dev/null
// +++ b/runtime/hsa-runtime/inc/amd_hsa_elf.h
// @@ -0,0 +1,295 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_ELF_H +#define AMD_HSA_ELF_H + +#include "amd_hsa_common.h" + +// ELF Header Enumeration Values. +#define EM_AMDGPU 224 +#define ELFOSABI_AMDGPU_HSA 64 +#define ELFABIVERSION_AMDGPU_HSA 0 +#define EF_AMDGPU_XNACK 0x00000001 +#define EF_AMDGPU_TRAP_HANDLER 0x00000002 + +// ELF Section Header Flag Enumeration Values. +#define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_CODE (0x00400000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_AGENT (0x00800000 & SHF_MASKOS) + +// +typedef enum { + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0, + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1, + AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2, + AMDGPU_HSA_SEGMENT_CODE_AGENT = 3, + AMDGPU_HSA_SEGMENT_LAST, +} amdgpu_hsa_elf_segment_t; + +// ELF Program Header Type Enumeration Values. +#define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) +#define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) +#define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT) +#define PT_AMDGPU_HSA_LOAD_CODE_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT) + +// ELF Symbol Type Enumeration Values. 
// Symbol types, expressed as offsets from STT_LOOS. STT_LOOS is not defined
// here and must be visible wherever these macros are expanded.
#define STT_AMDGPU_HSA_KERNEL (STT_LOOS + 0)
#define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1)
#define STT_AMDGPU_HSA_METADATA (STT_LOOS + 2)

// ELF Symbol Binding Enumeration Values.
// STB_LOOS must likewise be visible at the point of use.
#define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0)

// ELF Symbol Other Information Creation/Retrieval.
// Layout of the packed value `o`, as implemented by the macros below:
//   bits 0..1 - v (kept by ELF64_ST_AMDGPU_OTHER; no extractor defined here)
//   bits 2..3 - allocation (a)
//   bits 4..  - flags (f)
#define ELF64_ST_AMDGPU_ALLOCATION(o) (((o) >> 2) & 0x3)
#define ELF64_ST_AMDGPU_FLAGS(o) ((o) >> 4)
#define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3))

// Symbol allocation kinds. AMDGPU_HSA_SYMBOL_ALLOCATION_LAST is one past the
// last kind.
typedef enum {
  AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0,
  AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1,
  AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2,
  AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3,
  AMDGPU_HSA_SYMBOL_ALLOCATION_LAST,
} amdgpu_hsa_symbol_allocation_t;

// ELF Symbol Allocation Enumeration Values.
#define STA_AMDGPU_HSA_DEFAULT AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT
#define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM
#define STA_AMDGPU_HSA_GLOBAL_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT
#define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT

// Symbol flags. AMDGPU_HSA_SYMBOL_FLAG_LAST is one past the last flag.
typedef enum {
  AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0,
  AMDGPU_HSA_SYMBOL_FLAG_CONST = 1,
  AMDGPU_HSA_SYMBOL_FLAG_LAST,
} amdgpu_hsa_symbol_flag_t;

// ELF Symbol Flag Enumeration Values.
#define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST

// AMD GPU Relocation Type Enumeration Values.
#define R_AMDGPU_NONE 0
#define R_AMDGPU_32_LOW 1
#define R_AMDGPU_32_HIGH 2
#define R_AMDGPU_64 3
#define R_AMDGPU_INIT_SAMPLER 4
#define R_AMDGPU_INIT_IMAGE 5

// AMD GPU Note Type Enumeration Values.
#define NT_AMDGPU_HSA_CODE_OBJECT_VERSION 1
#define NT_AMDGPU_HSA_HSAIL 2
#define NT_AMDGPU_HSA_ISA 3
#define NT_AMDGPU_HSA_PRODUCER 4
#define NT_AMDGPU_HSA_PRODUCER_OPTIONS 5
#define NT_AMDGPU_HSA_EXTENSION 6
#define NT_AMDGPU_HSA_HLDEBUG_DEBUG 101
#define NT_AMDGPU_HSA_HLDEBUG_TARGET 102

// AMD GPU Metadata Kind Enumeration Values.
// Each enumeration below is paired with a fixed-width typedef (suffix 16_t or
// 8_t); the descriptor structs use the typedef for their fields.

// Metadata kinds, stored in a 16-bit field.
typedef uint16_t amdgpu_hsa_metadata_kind16_t;
typedef enum {
  AMDGPU_HSA_METADATA_KIND_NONE = 0,
  AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1,
  AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2,
  AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3,
  AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4
} amdgpu_hsa_metadata_kind_t;

// AMD GPU Sampler Coordinate Normalization Enumeration Values.
typedef uint8_t amdgpu_hsa_sampler_coord8_t;
typedef enum {
  AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0,
  AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1
} amdgpu_hsa_sampler_coord_t;

// AMD GPU Sampler Filter Enumeration Values.
typedef uint8_t amdgpu_hsa_sampler_filter8_t;
typedef enum {
  AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0,
  AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1
} amdgpu_hsa_sampler_filter_t;

// AMD GPU Sampler Addressing Enumeration Values.
typedef uint8_t amdgpu_hsa_sampler_addressing8_t;
typedef enum {
  AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0,
  AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1,
  AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2,
  AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3,
  AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4
} amdgpu_hsa_sampler_addressing_t;

// AMD GPU Sampler Descriptor.
// Eight bytes in total: a 16-bit size, a 16-bit metadata kind, three 8-bit
// sampler selectors and one reserved byte.
typedef struct amdgpu_hsa_sampler_descriptor_s {
  uint16_t size;
  amdgpu_hsa_metadata_kind16_t kind;
  amdgpu_hsa_sampler_coord8_t coord;
  amdgpu_hsa_sampler_filter8_t filter;
  amdgpu_hsa_sampler_addressing8_t addressing;
  uint8_t reserved1;
} amdgpu_hsa_sampler_descriptor_t;

// AMD GPU Image Geometry Enumeration Values.
typedef uint8_t amdgpu_hsa_image_geometry8_t;
typedef enum {
  AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0,
  AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1,
  AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2,
  AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3,
  AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4,
  AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5,
  AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6,
  AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7
} amdgpu_hsa_image_geometry_t;

// AMD GPU Image Channel Order Enumeration Values.
// Image channel orders, stored in an 8-bit field (values 0..19).
typedef uint8_t amdgpu_hsa_image_channel_order8_t;
typedef enum {
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18,
  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
} amdgpu_hsa_image_channel_order_t;

// AMD GPU Image Channel Type Enumeration Values.
// Stored in an 8-bit field (values 0..15).
typedef uint8_t amdgpu_hsa_image_channel_type8_t;
typedef enum {
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15
} amdgpu_hsa_image_channel_type_t;

// AMD GPU Image Descriptor.
+typedef struct amdgpu_hsa_image_descriptor_s { + uint16_t size; + amdgpu_hsa_metadata_kind16_t kind; + amdgpu_hsa_image_geometry8_t geometry; + amdgpu_hsa_image_channel_order8_t channel_order; + amdgpu_hsa_image_channel_type8_t channel_type; + uint8_t reserved1; + uint64_t width; + uint64_t height; + uint64_t depth; + uint64_t array; +} amdgpu_hsa_image_descriptor_t; + +typedef struct amdgpu_hsa_note_code_object_version_s { + uint32_t major_version; + uint32_t minor_version; +} amdgpu_hsa_note_code_object_version_t; + +typedef struct amdgpu_hsa_note_hsail_s { + uint32_t hsail_major_version; + uint32_t hsail_minor_version; + uint8_t profile; + uint8_t machine_model; + uint8_t default_float_round; +} amdgpu_hsa_note_hsail_t; + +typedef struct amdgpu_hsa_note_isa_s { + uint16_t vendor_name_size; + uint16_t architecture_name_size; + uint32_t major; + uint32_t minor; + uint32_t stepping; + char vendor_and_architecture_name[1]; +} amdgpu_hsa_note_isa_t; + +typedef struct amdgpu_hsa_note_producer_s { + uint16_t producer_name_size; + uint16_t reserved; + uint32_t producer_major_version; + uint32_t producer_minor_version; + char producer_name[1]; +} amdgpu_hsa_note_producer_t; + +typedef struct amdgpu_hsa_note_producer_options_s { + uint16_t producer_options_size; + char producer_options[1]; +} amdgpu_hsa_note_producer_options_t; + +typedef enum { + AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0, + AMDGPU_HSA_RODATA_GLOBAL_AGENT, + AMDGPU_HSA_RODATA_READONLY_AGENT, + AMDGPU_HSA_DATA_GLOBAL_PROGRAM, + AMDGPU_HSA_DATA_GLOBAL_AGENT, + AMDGPU_HSA_DATA_READONLY_AGENT, + AMDGPU_HSA_BSS_GLOBAL_PROGRAM, + AMDGPU_HSA_BSS_GLOBAL_AGENT, + AMDGPU_HSA_BSS_READONLY_AGENT, + AMDGPU_HSA_SECTION_LAST, +} amdgpu_hsa_elf_section_t; + +#endif // AMD_HSA_ELF_H diff --git a/runtime/hsa-runtime/inc/amd_hsa_kernel_code.h b/runtime/hsa-runtime/inc/amd_hsa_kernel_code.h new file mode 100644 index 0000000000..12f096b432 --- /dev/null +++ b/runtime/hsa-runtime/inc/amd_hsa_kernel_code.h @@ -0,0 +1,271 @@ 
+//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_KERNEL_CODE_H +#define AMD_HSA_KERNEL_CODE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Kernel Code Version Enumeration Values. +typedef uint32_t amd_kernel_code_version32_t; +enum amd_kernel_code_version_t { + AMD_KERNEL_CODE_VERSION_MAJOR = 1, + AMD_KERNEL_CODE_VERSION_MINOR = 1 +}; + +// AMD Machine Kind Enumeration Values. +typedef uint16_t amd_machine_kind16_t; +enum amd_machine_kind_t { + AMD_MACHINE_KIND_UNDEFINED = 0, + AMD_MACHINE_KIND_AMDGPU = 1 +}; + +// AMD Machine Version. +typedef uint16_t amd_machine_version16_t; + +// AMD Float Round Mode Enumeration Values. +enum amd_float_round_mode_t { + AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0, + AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1, + AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2, + AMD_FLOAT_ROUND_MODE_ZERO = 3 +}; + +// AMD Float Denorm Mode Enumeration Values. +enum amd_float_denorm_mode_t { + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0, + AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1, + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2, + AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3 +}; + +// AMD Compute Program Resource Register One. 
+typedef uint32_t amd_compute_pgm_rsrc_one32_t; +enum amd_compute_pgm_rsrc_one_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6) +}; + +// AMD System VGPR Workitem ID Enumeration Values. +enum amd_system_vgpr_workitem_id_t { + AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2, + AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3 +}; + +// AMD Compute Program Resource Register Two. 
+typedef uint32_t amd_compute_pgm_rsrc_two32_t; +enum amd_compute_pgm_rsrc_two_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1) +}; + +// AMD Element 
Byte Size Enumeration Values. +enum amd_element_byte_size_t { /* enumerator value v names an element size of (2 << v) bytes */ + AMD_ELEMENT_BYTE_SIZE_2 = 0, + AMD_ELEMENT_BYTE_SIZE_4 = 1, + AMD_ELEMENT_BYTE_SIZE_8 = 2, + AMD_ELEMENT_BYTE_SIZE_16 = 3 +}; + +// AMD Kernel Code Properties. +typedef uint32_t amd_kernel_code_properties32_t; /* uint32_t alias; amd_kernel_code_t::kernel_code_properties is declared with it */ +enum amd_kernel_code_properties_t { /* NOTE(review): if the macro arguments are (name, first bit, bit count), the entries below tile bits 0-31 exactly -- confirm in amd_hsa_common.h */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 10, 6), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1), +
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9) +}; + +// AMD Power Of Two Enumeration Values. +typedef uint8_t amd_powertwo8_t; /* 8-bit storage type used by the alignment and wavefront-size members of amd_kernel_code_t */ +enum amd_powertwo_t { /* enumerator value v names the quantity (1 << v) */ + AMD_POWERTWO_1 = 0, + AMD_POWERTWO_2 = 1, + AMD_POWERTWO_4 = 2, + AMD_POWERTWO_8 = 3, + AMD_POWERTWO_16 = 4, + AMD_POWERTWO_32 = 5, + AMD_POWERTWO_64 = 6, + AMD_POWERTWO_128 = 7, + AMD_POWERTWO_256 = 8 +}; + +// AMD Enabled Control Directive Enumeration Values. +typedef uint64_t amd_enabled_control_directive64_t; /* uint64_t alias; amd_control_directives_t::enabled_control_directives is declared with it */ +enum amd_enabled_control_directive_t { /* every enumerator is a distinct single bit, 1 << 0 through 1 << 8 */ + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1, + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256 +}; + +// AMD Exception Kind Enumeration Values. +typedef uint16_t amd_exception_kind16_t; +enum amd_exception_kind_t { /* every enumerator is a distinct single bit, 1 << 0 through 1 << 4 */ + AMD_EXCEPTION_KIND_INVALID_OPERATION = 1, + AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2, + AMD_EXCEPTION_KIND_OVERFLOW = 4, + AMD_EXCEPTION_KIND_UNDERFLOW = 8, + AMD_EXCEPTION_KIND_INEXACT = 16 +}; + +// AMD Control Directives.
+#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64 +#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES) +typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s { /* members sum to 128 bytes with no padding, assuming natural alignment of the fixed-width types */ + amd_enabled_control_directive64_t enabled_control_directives; + uint16_t enable_break_exceptions; + uint16_t enable_detect_exceptions; + uint32_t max_dynamic_group_size; + uint64_t max_flat_grid_size; + uint32_t max_flat_workgroup_size; + uint8_t required_dim; + uint8_t reserved1[3]; /* fills bytes 29-31 so required_grid_size starts on an 8-byte boundary */ + uint64_t required_grid_size[3]; + uint32_t required_workgroup_size[3]; + uint8_t reserved2[60]; +} amd_control_directives_t; + +// AMD Kernel Code. Header record for a kernel; the member list ends with an embedded amd_control_directives_t. +#define AMD_ISA_ALIGN_BYTES 256 +#define AMD_KERNEL_CODE_ALIGN_BYTES 64 +#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES) +typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s { /* NOTE(review): assuming the *32_t / *16_t typedefs declared earlier are 32 and 16 bits wide, the members before control_directives occupy 128 bytes -- confirm */ + amd_kernel_code_version32_t amd_kernel_code_version_major; + amd_kernel_code_version32_t amd_kernel_code_version_minor; + amd_machine_kind16_t amd_machine_kind; + amd_machine_version16_t amd_machine_version_major; + amd_machine_version16_t amd_machine_version_minor; + amd_machine_version16_t amd_machine_version_stepping; + int64_t kernel_code_entry_byte_offset; + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + uint64_t max_scratch_backing_memory_byte_size; + amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1; + amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2; + amd_kernel_code_properties32_t kernel_code_properties; + uint32_t workitem_private_segment_byte_size; + uint32_t workgroup_group_segment_byte_size; + uint32_t gds_segment_byte_size; + uint64_t kernarg_segment_byte_size; + uint32_t workgroup_fbarrier_count; + uint16_t wavefront_sgpr_count; + uint16_t workitem_vgpr_count; + uint16_t reserved_vgpr_first; + uint16_t reserved_vgpr_count; + uint16_t reserved_sgpr_first; + uint16_t reserved_sgpr_count; + uint16_t debug_wavefront_private_segment_offset_sgpr; + uint16_t
debug_private_segment_buffer_sgpr; + amd_powertwo8_t kernarg_segment_alignment; /* amd_powertwo8_t is the uint8_t typedef declared next to enum amd_powertwo_t */ + amd_powertwo8_t group_segment_alignment; + amd_powertwo8_t private_segment_alignment; + amd_powertwo8_t wavefront_size; + int32_t call_convention; + uint8_t reserved1[12]; + uint64_t runtime_loader_kernel_symbol; + amd_control_directives_t control_directives; /* itself declared with 64-byte alignment */ +} amd_kernel_code_t; + +// TODO: this struct should be completely gone once debugger designs/implements +// Debugger APIs. +typedef struct amd_runtime_loader_debug_info_s { + const void* elf_raw; + size_t elf_size; + const char *kernel_name; + const void *owning_segment; + hsa_profile_t profile; + uint64_t gpuva; +} amd_runtime_loader_debug_info_t; + +#endif // AMD_HSA_KERNEL_CODE_H diff --git a/runtime/hsa-runtime/inc/amd_hsa_queue.h b/runtime/hsa-runtime/inc/amd_hsa_queue.h new file mode 100644 index 0000000000..b37bb53f36 --- /dev/null +++ b/runtime/hsa-runtime/inc/amd_hsa_queue.h @@ -0,0 +1,86 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_QUEUE_H +#define AMD_HSA_QUEUE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Queue Properties. +typedef uint32_t amd_queue_properties32_t; /* uint32_t alias; amd_queue_t::queue_properties is declared with it */ +enum amd_queue_properties_t { /* NOTE(review): if the macro arguments are (name, first bit, bit count), the entries below tile bits 0-31 exactly -- confirm in amd_hsa_common.h */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 4, 28) +}; + +// AMD Queue.
+#define AMD_QUEUE_ALIGN_BYTES 64 +#define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES) +typedef struct AMD_QUEUE_ALIGN amd_queue_s { /* NOTE(review): the alignment macro follows 'struct' here but precedes it in amd_hsa_kernel_code.h -- confirm both placements are intended */ + hsa_queue_t hsa_queue; /* first member, so a pointer to an amd_queue_t and a pointer to its hsa_queue compare equal */ + uint32_t reserved1[4]; + volatile uint64_t write_dispatch_id; + uint32_t group_segment_aperture_base_hi; + uint32_t private_segment_aperture_base_hi; + uint32_t max_cu_id; + uint32_t max_wave_id; + volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; + volatile uint32_t legacy_doorbell_lock; + uint32_t reserved2[9]; + volatile uint64_t read_dispatch_id; + uint32_t read_dispatch_id_field_base_byte_offset; + uint32_t compute_tmpring_size; + uint32_t scratch_resource_descriptor[4]; + uint64_t scratch_backing_memory_location; + uint64_t scratch_backing_memory_byte_size; + uint32_t scratch_workitem_byte_size; + amd_queue_properties32_t queue_properties; /* amd_queue_properties32_t is the uint32_t typedef declared next to enum amd_queue_properties_t */ + uint32_t reserved3[2]; + hsa_signal_t queue_inactive_signal; + uint32_t reserved4[14]; +} amd_queue_t; + +#endif // AMD_HSA_QUEUE_H diff --git a/runtime/hsa-runtime/inc/amd_hsa_signal.h b/runtime/hsa-runtime/inc/amd_hsa_signal.h new file mode 100644 index 0000000000..deefc8f025 --- /dev/null +++ b/runtime/hsa-runtime/inc/amd_hsa_signal.h @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_SIGNAL_H +#define AMD_HSA_SIGNAL_H + +#include "amd_hsa_common.h" +#include "amd_hsa_queue.h" + +// AMD Signal Kind Enumeration Values. 
+typedef int64_t amd_signal_kind64_t; /* signed 64-bit storage; amd_signal_t::kind is declared with it */ +enum amd_signal_kind_t { /* the two doorbell kinds are the negative values; INVALID is 0 */ + AMD_SIGNAL_KIND_INVALID = 0, + AMD_SIGNAL_KIND_USER = 1, + AMD_SIGNAL_KIND_DOORBELL = -1, + AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 +}; + +// AMD Signal. +#define AMD_SIGNAL_ALIGN_BYTES 64 +#define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES) +typedef struct AMD_SIGNAL_ALIGN amd_signal_s { /* members sum to 64 bytes when pointers are 8 bytes wide */ + amd_signal_kind64_t kind; + union { /* value and the two doorbell pointers overlay the same storage */ + volatile int64_t value; + volatile uint32_t* legacy_hardware_doorbell_ptr; + volatile uint64_t* hardware_doorbell_ptr; + }; + uint64_t event_mailbox_ptr; + uint32_t event_id; + uint32_t reserved1; + uint64_t start_ts; + uint64_t end_ts; + union { /* reserved2 fixes this slot at 64 bits regardless of pointer width */ + amd_queue_t* queue_ptr; + uint64_t reserved2; + }; + uint32_t reserved3[2]; +} amd_signal_t; + +#endif // AMD_HSA_SIGNAL_H diff --git a/runtime/hsa-runtime/inc/hsa.h b/runtime/hsa-runtime/inc/hsa.h new file mode 100644 index 0000000000..159ef07d24 --- /dev/null +++ b/runtime/hsa-runtime/inc/hsa.h @@ -0,0 +1,3728 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_H_ +#define HSA_RUNTIME_INC_HSA_H_ + +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uintXX_t */ +#ifndef __cplusplus +#include <stdbool.h> /* bool (built in for C++, so only needed when compiling as C) */ +#endif /* __cplusplus */ + +// Placeholder for calling convention and import/export macros. Each macro below may be predefined by the build to override the default. +#ifndef HSA_CALL +#define HSA_CALL +#endif + +#ifndef HSA_EXPORT_DECORATOR +#ifdef __GNUC__ +#define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default"))) +#else +#define HSA_EXPORT_DECORATOR +#endif +#endif + +#define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL +#define HSA_API_IMPORT HSA_CALL + +#if !defined(HSA_API) && defined(HSA_EXPORT) /* building the runtime itself: decorate declarations for export */ +#define HSA_API HSA_API_EXPORT +#elif !defined(HSA_API) /* a caller-supplied HSA_API is kept as-is instead of being redefined */ +#define HSA_API HSA_API_IMPORT +#endif + +// Detect and set large model builds.
+#undef HSA_LARGE_MODEL +#if defined(__LP64__) || defined(_M_X64) +#define HSA_LARGE_MODEL +#endif + +// Try to detect CPU endianness +#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) +#define LITTLEENDIAN_CPU +#endif +#endif + +#undef HSA_LITTLE_ENDIAN +#if defined(LITTLEENDIAN_CPU) +#define HSA_LITTLE_ENDIAN +#elif defined(BIGENDIAN_CPU) /* big-endian build: HSA_LITTLE_ENDIAN stays undefined */ +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** \defgroup status Runtime Notifications + * @{ + */ + +/** + * @brief Status codes. Enumerator values do not follow declaration order (::HSA_STATUS_ERROR_INVALID_ISA_NAME is 0x1017). + */ +typedef enum { + /** + * The function has been executed successfully. + */ + HSA_STATUS_SUCCESS = 0x0, + /** + * A traversal over a list of elements has been interrupted by the + * application before completing. + */ + HSA_STATUS_INFO_BREAK = 0x1, + /** + * A generic error has occurred. + */ + HSA_STATUS_ERROR = 0x1000, + /** + * One of the actual arguments does not meet a precondition stated in the + * documentation of the corresponding formal argument. + */ + HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, + /** + * The requested queue creation is not valid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, + /** + * The requested allocation is not valid. + */ + HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, + /** + * The agent is invalid. + */ + HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, + /** + * The memory region is invalid. + */ + HSA_STATUS_ERROR_INVALID_REGION = 0x1005, + /** + * The signal is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, + /** + * The queue is invalid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, + /** + * The HSA runtime failed to allocate the necessary resources. This error + * may also occur when the HSA runtime needs to spawn threads or create + * internal OS-specific events.
+ */ + HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, + /** + * The AQL packet is malformed. + */ + HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, + /** + * An error has been detected while releasing a resource. + */ + HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, + /** + * An API other than ::hsa_init has been invoked while the reference count + * of the HSA runtime is 0. + */ + HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, + /** + * The maximum reference count for the object has been reached. + */ + HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, + /** + * The arguments passed to a function are not compatible. + */ + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, + /** + * The index is invalid. + */ + HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, + /** + * The instruction set architecture is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA = 0x100F, + /** + * The instruction set architecture name is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, + /** + * The code object is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, + /** + * The executable is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, + /** + * The executable is frozen. + */ + HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, + /** + * There is no symbol with the given name. + */ + HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, + /** + * The variable is already defined. + */ + HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, + /** + * The variable is undefined. + */ + HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, + /** + * An HSAIL operation resulted in a hardware exception. + */ + HSA_STATUS_ERROR_EXCEPTION = 0x1016 +} hsa_status_t; + +/** + * @brief Query additional information about a status code. + * + * @param[in] status Status code. + * + * @param[out] status_string A NUL-terminated string that describes the error + * status. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid + * status code, or @p status_string is NULL. + */ +hsa_status_t HSA_API + hsa_status_string(hsa_status_t status, const char **status_string); + +/** @} */ + +/** \defgroup common Common Definitions + * @{ + */ + +/** + * @brief Three-dimensional coordinate. + */ +typedef struct hsa_dim3_s { + /** + * X dimension. + */ + uint32_t x; + + /** + * Y dimension. + */ + uint32_t y; + + /** + * Z dimension. + */ + uint32_t z; +} hsa_dim3_t; + +/** + * @brief Access permissions. + */ +typedef enum { + /** + * Read-only access. + */ + HSA_ACCESS_PERMISSION_RO = 1, + /** + * Write-only access. + */ + HSA_ACCESS_PERMISSION_WO = 2, + /** + * Read and write access. + */ + HSA_ACCESS_PERMISSION_RW = 3 +} hsa_access_permission_t; + +/** @} **/ + +/** \defgroup initshutdown Initialization and Shut Down + * @{ + */ + +/** + * @brief Initialize the HSA runtime. + * + * @details Initializes the HSA runtime if it is not already initialized, and + * increases the reference counter associated with the HSA runtime for the + * current process. Invocation of any HSA function other than ::hsa_init results + * in undefined behavior if the current HSA runtime reference counter is less + * than one. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate + * the resources required by the implementation. + * + * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference + * count reaches INT32_MAX. + */ +hsa_status_t HSA_API hsa_init(); + +/** + * @brief Shut down the HSA runtime. + * + * @details Decreases the reference count of the HSA runtime instance. 
When the + * reference count reaches 0, the HSA runtime is no longer considered valid + * but the application might call ::hsa_init to initialize the HSA runtime + * again. + * + * Once the reference count of the HSA runtime reaches 0, all the resources + * associated with it (queues, signals, agent information, etc.) are + * considered invalid and any attempt to reference them in subsequent API calls + * results in undefined behavior. When the reference count reaches 0, the HSA + * runtime may release resources associated with it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_shut_down(); + +/** @} **/ + +/** \defgroup agentinfo System and Agent Information + * @{ + */ + +/** + * @brief Endianness. A convention used to interpret the bytes making up a data + * word. + */ +typedef enum { + /** + * The least significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_LITTLE = 0, + /** + * The most significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_BIG = 1 +} hsa_endianness_t; + +/** + * @brief Machine model. A machine model determines the size of certain data + * types in HSA runtime and an agent. + */ +typedef enum { + /** + * Small machine model. Addresses use 32 bits. + */ + HSA_MACHINE_MODEL_SMALL = 0, + /** + * Large machine model. Addresses use 64 bits. + */ + HSA_MACHINE_MODEL_LARGE = 1 +} hsa_machine_model_t; + +/** + * @brief Profile. A profile indicates a particular level of feature + * support. For example, in the base profile the application must use the HSA + * runtime allocator to reserve Shared Virtual Memory, while in the full profile + * any host pointer can be shared across all the agents. + */ +typedef enum { + /** + * Base profile. + */ + HSA_PROFILE_BASE = 0, + /** + * Full profile. 
+ */ + HSA_PROFILE_FULL = 1 +} hsa_profile_t; + +/** + * @brief System attributes. + */ +typedef enum { + /** + * Major version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MAJOR = 0, + /** + * Minor version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MINOR = 1, + /** + * Current timestamp. The value of this attribute monotonically increases at a + * constant rate. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP = 2, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3, + /** + * Maximum duration of a signal wait operation. Expressed as a count based on + * the timestamp frequency. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4, + /** + * Endianness of the system. The type of this attribute us ::hsa_endianness_t. + */ + HSA_SYSTEM_INFO_ENDIANNESS = 5, + /** + * Machine model supported by the HSA runtime. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_SYSTEM_INFO_MACHINE_MODEL = 6, + /** + * Bit-mask indicating which extensions are supported by the + * implementation. An extension with an ID of @p i is supported if the bit at + * position @p i is set. The type of this attribute is uint8_t[128]. + */ + HSA_SYSTEM_INFO_EXTENSIONS = 7 +} hsa_system_info_t; + +/** + * @brief Get the current value of a system attribute. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * system attribute, or @p value is NULL. + */ +hsa_status_t HSA_API + hsa_system_get_info(hsa_system_info_t attribute, void *value); + +/** + * @brief HSA extensions. + */ +typedef enum { + /** + * Finalizer extension. + */ + HSA_EXTENSION_FINALIZER = 0, + /** + * Images extension. + */ + HSA_EXTENSION_IMAGES = 1, + HSA_EXTENSION_AMD_PROFILER = 2 /* NOTE(review): the only enumerator without a doc comment; presumably an AMD-specific profiler extension -- confirm */ +} hsa_extension_t; + +/** + * @brief Query if a given version of an extension is supported by the HSA + * implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API + hsa_system_extension_supported(uint16_t extension, uint16_t version_major, + uint16_t version_minor, bool *result); + +/** + * @brief Retrieve the function pointers corresponding to a given version of an + * extension. Portable applications are expected to invoke the extension API + * using the returned function pointers. + * + * @details The application is responsible for verifying that the given version + * of the extension is supported by the HSA implementation (see + * ::hsa_system_extension_supported).
If the given combination of extension, + * major version, and minor version is not supported by the implementation, the + * behavior is undefined. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] version_minor Minor version number for which to retrieve the + * function pointer table. + * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API + hsa_system_get_extension_table(uint16_t extension, uint16_t version_major, + uint16_t version_minor, void *table); + +/** + * @brief Opaque handle representing an agent, a device that participates in + * the HSA memory model. An agent can submit AQL packets for execution, and + * may also accept AQL packets for execution (agent dispatch packets or kernel + * dispatch packets launching HSAIL-derived binaries). + */ +typedef struct hsa_agent_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_agent_t; + +/** + * @brief Agent features. + */ +typedef enum { + /** + * The agent supports AQL packets of kernel dispatch type. If this + * feature is enabled, the agent is also a kernel agent. + */ + HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1, + /** + * The agent supports AQL packets of agent dispatch type. + */ + HSA_AGENT_FEATURE_AGENT_DISPATCH = 2 +} hsa_agent_feature_t; + +/** + * @brief Hardware device type. + */ +typedef enum { + /** + * CPU device. + */ + HSA_DEVICE_TYPE_CPU = 0, + /** + * GPU device.
+ */ + HSA_DEVICE_TYPE_GPU = 1, + /** + * DSP device. + */ + HSA_DEVICE_TYPE_DSP = 2 +} hsa_device_type_t; + +/** + * @brief Default floating-point rounding mode. + */ +typedef enum { + /** + * Use a default floating-point rounding mode specified elsewhere. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0, + /** + * Operations that specify the default floating-point mode are rounded to zero + * by default. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1, + /** + * Operations that specify the default floating-point mode are rounded to the + * nearest representable number, with ties broken by selecting + * the value with an even least significant bit. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2 +} hsa_default_float_rounding_mode_t; + +/** + * @brief Agent attributes. Enumerator values do not follow declaration order (23 and 24 are declared between 5 and 6). + */ +typedef enum { + /** + * Agent name. The type of this attribute is a NUL-terminated char[64]. If + * the name of the agent uses fewer than 63 characters, the rest of the + * array must be filled with NULs. + */ + HSA_AGENT_INFO_NAME = 0, + /** + * Name of vendor. The type of this attribute is a NUL-terminated char[64]. If + * the name of the vendor uses fewer than 63 characters, the rest of the array + * must be filled with NULs. + */ + HSA_AGENT_INFO_VENDOR_NAME = 1, + /** + * Agent capability. The type of this attribute is ::hsa_agent_feature_t. + */ + HSA_AGENT_INFO_FEATURE = 2, + /** + * Machine model supported by the agent. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_AGENT_INFO_MACHINE_MODEL = 3, + /** + * Profile supported by the agent. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_AGENT_INFO_PROFILE = 4, + /** + * Default floating-point rounding mode. The type of this attribute is + * ::hsa_default_float_rounding_mode_t, but the value + * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed.
+ */ + HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5, + /** + * Default floating-point rounding modes supported by the agent in the Base + * profile. The type of this attribute is a mask of + * ::hsa_default_float_rounding_mode_t. The default floating-point rounding + * mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not be set. + */ + HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23, + /** + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the current agent. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is bool. + */ + HSA_AGENT_INFO_FAST_F16_OPERATION = 24, + /** + * Number of work-items in a wavefront. Must be a power of 2 in the range + * [1,256]. The value of this attribute is undefined if the agent is not + * a kernel agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, + /** + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0. No maximum can exceed the value of + * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is uint16_t[3]. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, + /** + * Maximum total number of work-items in a work-group. The value of this + * attribute is undefined if the agent is not a kernel agent. The type + * of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8, + /** + * Maximum number of work-items of each dimension of a grid. Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of + * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined if + * the agent is not a kernel agent. The type of this attribute is + * ::hsa_dim3_t.
+ */ + HSA_AGENT_INFO_GRID_MAX_DIM = 9, + /** + * Maximum total number of work-items in a grid. The value of this attribute + * is undefined if the agent is not a kernel agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_GRID_MAX_SIZE = 10, + /** + * Maximum number of fbarriers per work-group. Must be at least 32. The value + * of this attribute is undefined if the agent is not a kernel agent. The + * type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11, + /** + * Maximum number of queues that can be active (created but not destroyed) at + * one time in the agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUES_MAX = 12, + /** + * Minimum number of packets that a queue created in the agent + * can hold. Must be a power of 2 greater than 0. Must not exceed + * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13, + /** + * Maximum number of packets that a queue created in the agent can + * hold. Must be a power of 2 greater than 0. The type of this attribute + * is uint32_t. + */ + HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14, + /** + * Type of a queue created in the agent. The type of this attribute is + * ::hsa_queue_type_t. + */ + HSA_AGENT_INFO_QUEUE_TYPE = 15, + /** + * Identifier of the NUMA node associated with the agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_NODE = 16, + /** + * Type of hardware device associated with the agent. The type of this + * attribute is ::hsa_device_type_t. + */ + HSA_AGENT_INFO_DEVICE = 17, + /** + * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size + * of 0 for a particular level indicates that there is no cache information + * for that level. The type of this attribute is uint32_t[4]. + */ + HSA_AGENT_INFO_CACHE_SIZE = 18, + /** + * Instruction set architecture of the agent. The type of this attribute + * is ::hsa_isa_t.
+ */ + HSA_AGENT_INFO_ISA = 19, + /** + * Bit-mask indicating which extensions are supported by the agent. An + * extension with an ID of @p i is supported if the bit at position @p i is + * set. The type of this attribute is uint8_t[128]. + */ + HSA_AGENT_INFO_EXTENSIONS = 20, + /** + * Major version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. + */ + HSA_AGENT_INFO_VERSION_MAJOR = 21, + /** + * Minor version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. + */ + HSA_AGENT_INFO_VERSION_MINOR = 22 +} hsa_agent_info_t; + +/** + * @brief Get the current value of an attribute for a given agent. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * agent attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_agent_get_info(hsa_agent_t agent, + hsa_agent_info_t attribute, + void *value); + +/** + * @brief Iterate over the available agents, and invoke an + * application-defined callback on every iteration. + * + * @param[in] callback Callback to be invoked once per agent. The HSA + * runtime passes two arguments to the callback, the agent and the + * application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_iterate_agents returns that status value.
+ * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API + hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent, void *data), + void *data); + +/* + +// If we do not know the size of an attribute, we need to query it first +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_get_info_size( + hsa_agent_t agent, + hsa_agent_info_t attribute, + size_t* size); + +// Set the value of an agents attribute +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_set_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +*/ + +/** + * @brief Exception policies applied in the presence of hardware exceptions. + */ +typedef enum { + /** + * If a hardware exception is detected, a work-item signals an exception. + */ + HSA_EXCEPTION_POLICY_BREAK = 1, + /** + * If a hardware exception is detected, a hardware status bit is set. + */ + HSA_EXCEPTION_POLICY_DETECT = 2 +} hsa_exception_policy_t; + +/** + * @brief Retrieve the exception policy support for a given combination of + * agent and profile + * + * @param[in] agent Agent. + * + * @param[in] profile Profile. + * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. 
+ * + */ +hsa_status_t HSA_API hsa_agent_get_exception_policies(hsa_agent_t agent, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Query if a given version of an extension is supported by an agent + * + * @param[in] extension Extension identifier. + * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API + hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent, + uint16_t version_major, + uint16_t version_minor, bool *result); + +/** @} */ + +/** \defgroup signals Signals + * @{ + */ + +/** + * @brief Signal handle. + */ +typedef struct hsa_signal_s { + /** + * Opaque handle. The value 0 is reserved. + */ + uint64_t handle; +} hsa_signal_t; + +/** + * @brief Signal value. The value occupies 32 bits in small machine mode, and 64 + * bits in large machine mode. + */ +#ifdef HSA_LARGE_MODEL +typedef int64_t hsa_signal_value_t; +#else +typedef int32_t hsa_signal_value_t; +#endif + +/** + * @brief Create a signal. + * + * @param[in] initial_value Initial value of the signal. + * + * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that + * any agent might wait on the signal. + * + * @param[in] consumers List of agents that might consume (wait on) the + * signal. 
If @p num_consumers is 0, this argument is ignored; otherwise, the + * HSA runtime might use the list to optimize the handling of the signal + * object. If an agent not listed in @p consumers waits on the returned + * signal, the behavior is undefined. The memory associated with @p consumers + * can be reused or freed after the function returns. + * + * @param[out] signal Pointer to a memory location where the HSA runtime will + * store the newly created signal handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate the + * resources required by the implementation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p + * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers + * contains duplicates. + */ +hsa_status_t HSA_API + hsa_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers, + const hsa_agent_t *consumers, hsa_signal_t *signal); + +/** + * @brief Destroy a signal previously created by ::hsa_signal_create. + * + * @param[in] signal Signal. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0. + */ +hsa_status_t HSA_API hsa_signal_destroy(hsa_signal_t signal); + +/** + * @brief Atomically read the current value of a signal. + * + * @param[in] signal Signal. + * + * @return Value of the signal.
+ */ +hsa_signal_value_t HSA_API hsa_signal_load_acquire(hsa_signal_t signal); + +/** + * @copydoc hsa_signal_load_acquire + */ +hsa_signal_value_t HSA_API hsa_signal_load_relaxed(hsa_signal_t signal); + +/** + * @brief Atomically set the value of a signal. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value. + */ +void HSA_API + hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_store_relaxed + */ +void HSA_API + hsa_signal_store_release(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal and return its previous value. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value New value. + * + * @return Value of the signal prior to the exchange. + * + */ +hsa_signal_value_t HSA_API + hsa_signal_exchange_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_acq_rel + */ +hsa_signal_value_t HSA_API + hsa_signal_exchange_acquire(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_acq_rel + */ +hsa_signal_value_t HSA_API + hsa_signal_exchange_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_acq_rel + */ +hsa_signal_value_t HSA_API + hsa_signal_exchange_release(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal if the observed value is equal to + * the expected value. The observed value is returned regardless of whether the + * replacement was done. 
+ * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue + * doorbell signal, the behavior is undefined. + * + * @param[in] expected Value to compare with. + * + * @param[in] value New value. + * + * @return Observed value of the signal. + * + */ +hsa_signal_value_t HSA_API hsa_signal_cas_acq_rel(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_acq_rel + */ +hsa_signal_value_t HSA_API hsa_signal_cas_acquire(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_acq_rel + */ +hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_acq_rel + */ +hsa_signal_value_t HSA_API hsa_signal_cas_release(hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @brief Atomically increment the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to add to the value of the signal. 
+ * + */ +void HSA_API + hsa_signal_add_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_acq_rel + */ +void HSA_API + hsa_signal_add_acquire(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_acq_rel + */ +void HSA_API + hsa_signal_add_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_acq_rel + */ +void HSA_API + hsa_signal_add_release(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @brief Atomically decrement the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to subtract from the value of the signal. + * + */ +void HSA_API + hsa_signal_subtract_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_acq_rel + */ +void HSA_API + hsa_signal_subtract_acquire(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_acq_rel + */ +void HSA_API + hsa_signal_subtract_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_acq_rel + */ +void HSA_API + hsa_signal_subtract_release(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise AND operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to AND with the value of the signal. 
+ * + */ +void HSA_API + hsa_signal_and_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_acq_rel + */ +void HSA_API + hsa_signal_and_acquire(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_acq_rel + */ +void HSA_API + hsa_signal_and_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_acq_rel + */ +void HSA_API + hsa_signal_and_release(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise OR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to OR with the value of the signal. + */ +void HSA_API + hsa_signal_or_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_acq_rel + */ +void HSA_API + hsa_signal_or_acquire(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_acq_rel + */ +void HSA_API + hsa_signal_or_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_acq_rel + */ +void HSA_API + hsa_signal_or_release(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise XOR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to XOR with the value of the signal. 
+ * + */ +void HSA_API + hsa_signal_xor_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_acq_rel + */ +void HSA_API + hsa_signal_xor_acquire(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_acq_rel + */ +void HSA_API + hsa_signal_xor_relaxed(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_acq_rel + */ +void HSA_API + hsa_signal_xor_release(hsa_signal_t signal, hsa_signal_value_t value); + +/** + * @brief Wait condition operator. + */ +typedef enum { + /** + * The two operands are equal. + */ + HSA_SIGNAL_CONDITION_EQ = 0, + /** + * The two operands are not equal. + */ + HSA_SIGNAL_CONDITION_NE = 1, + /** + * The first operand is less than the second operand. + */ + HSA_SIGNAL_CONDITION_LT = 2, + /** + * The first operand is greater than or equal to the second operand. + */ + HSA_SIGNAL_CONDITION_GTE = 3 +} hsa_signal_condition_t; + +/** + * @brief State of the application thread during a signal wait. + */ +typedef enum { + /** + * The application thread may be rescheduled while waiting on the signal. + */ + HSA_WAIT_STATE_BLOCKED = 0, + /** + * The application thread stays active while waiting on a signal. + */ + HSA_WAIT_STATE_ACTIVE = 1 +} hsa_wait_state_t; + +/** + * @brief Wait until a signal value satisfies a specified condition, or a + * certain amount of time has elapsed. + * + * @details A wait operation can spuriously resume at any time sooner than the + * timeout (for example, due to system or other external factors) even when the + * condition has not been met. + * + * The function is guaranteed to return if the signal value satisfies the + * condition at some point in time during the wait, but the value returned to + * the application might not satisfy the condition. The application must ensure + * that signals are used in such a way that wait wakeup conditions are not + * invalidated before dependent threads have woken up.
+ * + * When the wait operation internally loads the value of the passed signal, it + * uses the memory order indicated in the function name. + * + * @param[in] signal Signal. + * + * @param[in] condition Condition used to compare the signal value with @p + * compare_value. + * + * @param[in] compare_value Value to compare with. + * + * @param[in] timeout_hint Maximum duration of the wait. Specified in the same + * unit as the system timestamp. The operation might block for a shorter or + * longer time even if the condition is not met. A value of UINT64_MAX indicates + * no maximum. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is ultimately decided by + * HSA runtime and may not match the provided hint. A value of + * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal + * update by avoiding rescheduling overhead. + * + * @return Observed value of the signal, which might not satisfy the specified + * condition. + * + */ +hsa_signal_value_t HSA_API + hsa_signal_wait_acquire(hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @copydoc hsa_signal_wait_acquire + */ +hsa_signal_value_t HSA_API + hsa_signal_wait_relaxed(hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** @} */ + +/** \defgroup memory Memory + * @{ + */ + +/** + * @brief A memory region represents a block of virtual memory with certain + * properties. For example, the HSA runtime represents fine-grained memory in + * the global segment using a region. A region might be associated with more + * than one agent. + */ +typedef struct hsa_region_s { + /** + * Opaque handle. 
+ */ + uint64_t handle; +} hsa_region_t; + +/** @} */ + +/** \defgroup queue Queues + * @{ + */ + +/** + * @brief Queue type. Intended to be used for dynamic queue protocol + * determination. + */ +typedef enum { + /** + * Queue supports multiple producers. + */ + HSA_QUEUE_TYPE_MULTI = 0, + /** + * Queue only supports a single producer. + */ + HSA_QUEUE_TYPE_SINGLE = 1 +} hsa_queue_type_t; + +/** + * @brief Queue features. + */ +typedef enum { + /** + * Queue supports kernel dispatch packets. + */ + HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1, + + /** + * Queue supports agent dispatch packets. + */ + HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2 +} hsa_queue_feature_t; + +/** + * @brief User mode queue. + * + * @details The queue structure is read-only and allocated by the HSA runtime, + * but agents can directly modify the contents of the buffer pointed by @a + * base_address, or use HSA runtime APIs to access the doorbell signal. + * + */ +typedef struct hsa_queue_s { + /** + * Queue type. + */ + hsa_queue_type_t type; + + /** + * Queue features mask. This is a bit-field of ::hsa_queue_feature_t + * values. Applications should ignore any unknown set bits. + */ + uint32_t features; + +#ifdef HSA_LARGE_MODEL + void *base_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Starting address of the HSA runtime-allocated buffer used to store the AQL + * packets. Must be aligned to the size of an AQL packet. + */ + void *base_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; +#else + uint32_t reserved0; + void *base_address; +#endif + + /** + * Signal object used by the application to indicate the ID of a packet that + * is ready to be processed. The HSA runtime manages the doorbell signal. If + * the application tries to replace or destroy this signal, the behavior is + * undefined. + * + * If @a type is ::HSA_QUEUE_TYPE_SINGLE the doorbell signal value must be + * updated in a monotonically increasing fashion. 
If @a type is + * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any + * value. + */ + hsa_signal_t doorbell_signal; + + /** + * Maximum number of packets the queue can hold. Must be a power of 2. + */ + uint32_t size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Queue identifier, which is unique over the lifetime of the application. + */ + uint64_t id; + +} hsa_queue_t; + +/** + * @brief Create a user mode queue. + * + * @details The HSA runtime creates the queue structure, the underlying packet + * buffer, the completion signal, and the write and read indexes. The initial + * value of the write and read indexes is 0. The type of every packet in the + * buffer is initialized to ::HSA_PACKET_TYPE_INVALID. + * + * The application should only rely on the error code returned to determine if + * the queue is valid. + * + * @param[in] agent Agent where to create the queue. + * + * @param[in] size Number of packets the queue is expected to + * hold. Must be a power of 2 between 1 and the value of + * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly + * created queue is the maximum of @p size and the value of + * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent. + * + * @param[in] type Type of the queue. If the value of + * ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE, then @p + * type must also be ::HSA_QUEUE_TYPE_SINGLE. + * + * @param[in] callback Callback invoked by the HSA runtime for every + * asynchronous event related to the newly created queue. May be NULL. The HSA + * runtime passes three arguments to the callback: a code identifying the event + * that triggered the invocation, a pointer to the queue where the event + * originated, and the application data. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. 
+ * + * @param[in] private_segment_size Hint indicating the maximum + * expected private segment usage per work-item, in bytes. There may + * be performance degradation if the application places a kernel + * dispatch packet in the queue and the corresponding private segment + * usage exceeds @p private_segment_size. If the application does not + * want to specify any particular value for this argument, @p + * private_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. + * + * @param[in] group_segment_size Hint indicating the maximum expected + * group segment usage per work-group, in bytes. There may be + * performance degradation if the application places a kernel dispatch + * packet in the queue and the corresponding group segment usage + * exceeds @p group_segment_size. If the application does not want to + * specify any particular value for this argument, @p + * group_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate + * the resources required by the implementation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not + * support queues of the given type. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, + * @p size is 0, @p type is an invalid queue type, or @p queue is NULL. 
+ * + */ +hsa_status_t HSA_API + hsa_queue_create(hsa_agent_t agent, uint32_t size, hsa_queue_type_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, + void *data), + void *data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t **queue); + +/** + * @brief Create a queue for which the application or a kernel is responsible + * for processing the AQL packets. + * + * @details The application can use this function to create queues where AQL + * packets are not parsed by the packet processor associated with an agent, + * but rather by a unit of execution running on that agent (for example, a + * thread in the host application). + * + * The application is responsible for ensuring that all the producers and + * consumers of the resulting queue can access the provided doorbell signal + * and memory region. The application is also responsible for ensuring that the + * unit of execution processing the queue packets supports the indicated + * features (AQL packet types). + * + * When the queue is created, the HSA runtime allocates the packet buffer using + * @p region, and the write and read indexes. The initial value of the write and + * read indexes is 0, and the type of every packet in the buffer is initialized + * to ::HSA_PACKET_TYPE_INVALID. The value of the @e size, @e type, @e features, + * and @e doorbell_signal fields in the returned queue match the values passed + * by the application. + * + * @param[in] region Memory region that the HSA runtime should use to allocate + * the AQL packet buffer and any other queue metadata. + * + * @param[in] size Number of packets the queue is expected to hold. Must be a + * power of 2 greater than 0. + * + * @param[in] type Queue type. + * + * @param[in] features Supported queue features. This is a bit-field of + * ::hsa_queue_feature_t values. + * + * @param[in] doorbell_signal Doorbell signal that the HSA runtime must + * associate with the returned queue. 
The signal handle must not be 0. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. The application should not rely on the value + * returned for this argument but only on the status code to determine if the + * queue is valid. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate + * the resources required by the implementation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p + * size is 0, @p type is an invalid queue type, the doorbell signal handle is + * 0, or @p queue is NULL. + * + */ +hsa_status_t HSA_API + hsa_soft_queue_create(hsa_region_t region, uint32_t size, + hsa_queue_type_t type, uint32_t features, + hsa_signal_t doorbell_signal, hsa_queue_t **queue); + +/** + * @brief Destroy a user mode queue. + * + * @details When a queue is destroyed, the state of the AQL packets that have + * not yet been fully processed (their completion phase has not finished) + * becomes undefined. It is the responsibility of the application to ensure that + * all pending queue operations are finished if their results are required. + * + * The resources allocated by the HSA runtime during queue creation (queue + * structure, ring buffer, doorbell signal) are released. The queue should not + * be accessed after being destroyed. + * + * @param[in] queue Pointer to a queue created using ::hsa_queue_create. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */ +hsa_status_t HSA_API hsa_queue_destroy(hsa_queue_t *queue); + +/** + * @brief Inactivate a queue. + * + * @details Inactivating the queue aborts any pending executions and prevents any + * new packets from being processed. Any more packets written to the queue once + * it is inactivated will be ignored by the packet processor. + * + * @param[in] queue Pointer to a queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_inactivate(hsa_queue_t *queue); + +/** + * @brief Atomically load the read index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Read index of the queue pointed to by @p queue. + */ +uint64_t HSA_API hsa_queue_load_read_index_acquire(const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_read_index_acquire + */ +uint64_t HSA_API hsa_queue_load_read_index_relaxed(const hsa_queue_t *queue); + +/** + * @brief Atomically load the write index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Write index of the queue pointed to by @p queue. + */ +uint64_t HSA_API hsa_queue_load_write_index_acquire(const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_write_index_acquire + */ +uint64_t HSA_API hsa_queue_load_write_index_relaxed(const hsa_queue_t *queue); + +/** + * @brief Atomically set the write index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the write index.
+ * + */ +void HSA_API hsa_queue_store_write_index_relaxed(const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_write_index_relaxed + */ +void HSA_API hsa_queue_store_write_index_release(const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically set the write index of a queue if the observed value is + * equal to the expected value. The application can inspect the returned value + * to determine if the replacement was done. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] expected Expected value. + * + * @param[in] value Value to assign to the write index if @p expected matches + * the observed write index. Must be greater than @p expected. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_cas_write_index_acq_rel(const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_acq_rel + */ +uint64_t HSA_API hsa_queue_cas_write_index_acquire(const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_acq_rel + */ +uint64_t HSA_API hsa_queue_cas_write_index_relaxed(const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_acq_rel + */ +uint64_t HSA_API hsa_queue_cas_write_index_release(const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @brief Atomically increment the write index of a queue by an offset. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to add to the write index. + * + * @return Previous value of the write index. 
+ */ +uint64_t HSA_API + hsa_queue_add_write_index_acq_rel(const hsa_queue_t *queue, uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_acq_rel + */ +uint64_t HSA_API + hsa_queue_add_write_index_acquire(const hsa_queue_t *queue, uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_acq_rel + */ +uint64_t HSA_API + hsa_queue_add_write_index_relaxed(const hsa_queue_t *queue, uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_acq_rel + */ +uint64_t HSA_API + hsa_queue_add_write_index_release(const hsa_queue_t *queue, uint64_t value); + +/** + * @brief Atomically set the read index of a queue. + * + * @details Modifications of the read index are not allowed and result in + * undefined behavior if the queue is associated with an agent for which + * only the corresponding packet processor is permitted to update the read + * index. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the read index. + * + */ +void HSA_API hsa_queue_store_read_index_relaxed(const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_read_index_relaxed + */ +void HSA_API hsa_queue_store_read_index_release(const hsa_queue_t *queue, + uint64_t value); +/** @} */ + +/** \defgroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief Packet type. + */ +typedef enum { + /** + * Vendor-specific packet. + */ + HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0, + /** + * The packet has been processed in the past, but has not been reassigned to + * the packet processor. A packet processor must not process a packet of this + * type. All queues support this packet type. + */ + HSA_PACKET_TYPE_INVALID = 1, + /** + * Packet used by agents for dispatching jobs to kernel agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). 
+ */ + HSA_PACKET_TYPE_KERNEL_DISPATCH = 2, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_AND = 3, + /** + * Packet used by agents for dispatching jobs to agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). + */ + HSA_PACKET_TYPE_AGENT_DISPATCH = 4, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_OR = 5 +} hsa_packet_type_t; + +/** + * @brief Scope of the memory fence operation associated with a packet. + */ +typedef enum { + /** + * No scope (no fence is applied). The packet relies on external fences to + * ensure visibility of memory updates. + */ + HSA_FENCE_SCOPE_NONE = 0, + /** + * The fence is applied with agent scope for the global segment. + */ + HSA_FENCE_SCOPE_AGENT = 1, + /** + * The fence is applied across both agent and system scope for the global + * segment. + */ + HSA_FENCE_SCOPE_SYSTEM = 2 +} hsa_fence_scope_t; + +/** + * @brief Sub-fields of the @a header field that is present in any AQL + * packet. The offset (with respect to the address of @a header) of a sub-field + * is identical to its enumeration constant. The width of each sub-field is + * determined by the corresponding value in ::hsa_packet_header_width_t. The + * offset and the width are expressed in bits. + */ +typedef enum { + /** + * Packet type. The value of this sub-field must be one of + * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the + * packet layout is vendor-specific. + */ + HSA_PACKET_HEADER_TYPE = 0, + /** + * Barrier bit. If the barrier bit is set, the processing of the current + * packet only launches when all preceding packets (within the same queue) are + * complete. 
+ */ + HSA_PACKET_HEADER_BARRIER = 8, + /** + * Acquire fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied before the packet enters the + * active phase. An acquire fence ensures that any subsequent global segment + * or image loads by any unit of execution that belongs to a dispatch that has + * not yet entered the active phase on any queue of the same kernel agent, + * sees any data previously released at the scopes specified by the acquire + * fence. The value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9, + /** + * Release fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied after kernel completion but + * before the packet is completed. A release fence makes any global segment or + * image data that was stored by any unit of execution that belonged to a + * dispatch that has completed the active phase on any queue of the same + * kernel agent visible in all the scopes specified by the release fence. The + * value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11 +} hsa_packet_header_t; + +/** + * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t. + */ +typedef enum { + HSA_PACKET_HEADER_WIDTH_TYPE = 8, + HSA_PACKET_HEADER_WIDTH_BARRIER = 1, + HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2, + HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2 +} hsa_packet_header_width_t; + +/** + * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset + * (with respect to the address of @a setup) of a sub-field is identical to its + * enumeration constant. The width of each sub-field is determined by the + * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The + * offset and the width are expressed in bits. + */ +typedef enum { + /** + * Number of dimensions of the grid.
Valid values are 1, 2, or 3. + * + */ + HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0 +} hsa_kernel_dispatch_packet_setup_t; + +/** + * @brief Width (in bits) of the sub-fields in + * ::hsa_kernel_dispatch_packet_setup_t. + */ +typedef enum { + HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2 +} hsa_kernel_dispatch_packet_setup_width_t; + +/** + * @brief AQL kernel dispatch packet + */ +typedef struct hsa_kernel_dispatch_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Dispatch setup parameters. Used to configure kernel dispatch parameters + * such as the number of dimensions in the grid. The parameters are described + * by ::hsa_kernel_dispatch_packet_setup_t. + */ + uint16_t setup; + + /** + * X dimension of work-group, in work-items. Must be greater than 0. + */ + uint16_t workgroup_size_x; + + /** + * Y dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 dimension, the only valid value is 1. + */ + uint16_t workgroup_size_y; + + /** + * Z dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 or 2 dimensions, the only valid value is 1. + */ + uint16_t workgroup_size_z; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * X dimension of grid, in work-items. Must be greater than 0. Must + * not be smaller than @a workgroup_size_x. + */ + uint32_t grid_size_x; + + /** + * Y dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 dimension, the only valid value is 1. Must not be smaller than @a + * workgroup_size_y. + */ + uint32_t grid_size_y; + + /** + * Z dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a + * workgroup_size_z. 
+ */ + uint32_t grid_size_z; + + /** + * Size in bytes of private memory allocation request (per work-item). + */ + uint32_t private_segment_size; + + /** + * Size in bytes of group memory allocation request (per work-group). Must not + * be less than the sum of the group memory used by the kernel (and the + * functions it calls directly or indirectly) and the dynamically allocated + * group segment variables. + */ + uint32_t group_segment_size; + + /** + * Opaque handle to a code object that includes an implementation-defined + * executable code for the kernel. + */ + uint64_t kernel_object; + +#ifdef HSA_LARGE_MODEL + void *kernarg_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Pointer to a buffer containing the kernel arguments. May be NULL. + * + * The buffer must be allocated using ::hsa_memory_allocate, and must not be + * modified once the kernel dispatch packet is enqueued until the dispatch has + * completed execution. + */ + void *kernarg_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void *kernarg_address; +#endif + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_kernel_dispatch_packet_t; + +/** + * @brief Agent dispatch packet. + */ +typedef struct hsa_agent_dispatch_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Application-defined function to be performed by the destination agent. + */ + uint16_t type; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + +#ifdef HSA_LARGE_MODEL + void *return_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Address where to store the function return values, if any. 
+ */ + void *return_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void *return_address; +#endif + + /** + * Function arguments. + */ + uint64_t arg[4]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_agent_dispatch_packet_t; + +/** + * @brief Barrier-AND packet. + */ +typedef struct hsa_barrier_and_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as satisfied + * dependencies. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_and_packet_t; + +/** + * @brief Barrier-OR packet. + */ +typedef struct hsa_barrier_or_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as dependencies not + * satisfied. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. 
+ */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_or_packet_t; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Memory segments associated with a region. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_REGION_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_REGION_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_REGION_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_REGION_SEGMENT_GROUP = 3 +} hsa_region_segment_t; + +/** + * @brief Global region flags. + */ +typedef enum { + /** + * The application can use memory in the region to store kernel arguments, and + * provide the values for the kernarg segment of a kernel dispatch. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set. + */ + HSA_REGION_GLOBAL_FLAG_KERNARG = 1, + /** + * Updates to memory in this region are immediately visible to all the + * agents under the terms of the HSA memory model. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set. + */ + HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Updates to memory in this region can be performed by a single agent at + * a time. If a different agent in the system is allowed to access the + * region, the application must explicitly invoke ::hsa_memory_assign_agent + * in order to transfer ownership to that agent for a particular buffer. + */ + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4 +} hsa_region_global_flag_t; + +/** + * @brief Attributes of a memory region.
+ */ +typedef enum { + /** + * Segment where memory in the region can be used. The type of this + * attribute is ::hsa_region_segment_t. + */ + HSA_REGION_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of + * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t + * values. + */ + HSA_REGION_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this region, in bytes. The type of this attribute is size_t. + */ + HSA_REGION_INFO_SIZE = 2, + /** + * Maximum allocation size in this region, in bytes. Must not exceed the value + * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t. + * + * If the region is in the global or readonly segments, this is the maximum + * size that the application can pass to ::hsa_memory_allocate. If the region + * is in the group segment, this is the maximum size (per work-group) that can + * be requested for a given kernel dispatch. If the region is in the private + * segment, this is the maximum size (per work-item) that can be requested for a + * specific kernel dispatch. + */ + HSA_REGION_INFO_ALLOC_MAX_SIZE = 4, + /** + * Indicates whether memory in this region can be allocated using + * ::hsa_memory_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for regions in the group and private + * segments. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by ::hsa_memory_allocate in + * this region. The size of a buffer allocated in this region is a multiple of + * the value of this attribute. The value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type + * of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_memory_allocate in this region.
The + * value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must + * be a power of 2. The type of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7 +} hsa_region_info_t; + +/** + * @brief Get the current value of an attribute of a region. + * + * @param[in] region A valid region. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * region attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_region_get_info(hsa_region_t region, + hsa_region_info_t attribute, + void *value); + +/** + * @brief Iterate over the memory regions associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per region that is + * accessible from the agent. The HSA runtime passes two arguments to the + * callback, the region and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and ::hsa_agent_iterate_regions returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_regions( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t region, void *data), void *data); + +/** + * @brief Allocate a block of memory in a given region. + * + * @param[in] region Region where to allocate memory from. The region must have + * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE + * in @p region. + * + * @param[out] ptr Pointer to the location where to store the base address of + * the allocated block. The returned base address is aligned to the value of + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation + * fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p region, or @p size is greater than the value of + * ::HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0. + */ +hsa_status_t HSA_API + hsa_memory_allocate(hsa_region_t region, size_t size, void **ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_memory_allocate. + * + * @param[in] ptr Pointer to a memory block.
If @p ptr does not match a value + * previously returned by ::hsa_memory_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_memory_free(void *ptr); + +/** + * @brief Copy a block of memory. + * + * @param[out] dst Buffer where the content is to be copied. + * + * @param[in] src A valid pointer to the source of data to be copied. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed to by @p dst or @p src results in undefined + * behavior. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL. + */ +hsa_status_t HSA_API hsa_memory_copy(void *dst, const void *src, size_t size); + +/** + * @brief Change the ownership of a global, coarse-grained buffer. + * + * @details The contents of a coarse-grained buffer are visible to an agent + * only after ownership has been explicitly transferred to that agent. Once the + * operation completes, the previous owner can no longer access the data in the + * buffer. + * + * An implementation of the HSA runtime is allowed, but not required, to change + * the physical location of the buffer when ownership is transferred to a + * different agent. In general the application must not assume this + * behavior. The virtual location (address) of the passed buffer is never + * modified. + * + * @param[in] ptr Base address of a global buffer. The pointer should match an + * address previously returned by ::hsa_memory_allocate.
The size of the buffer + * affected by the ownership change is identical to the size of that previous + * allocation. If @p ptr points to a fine-grained global buffer, no operation is + * performed and the function returns success. If @p ptr does not point to + * global memory, the behavior is undefined. + * + * @param[in] agent Agent that becomes the owner of the buffer. The + * application is responsible for ensuring that @p agent has access to the + * region that contains the buffer. It is allowed to change ownership to an + * agent that is already the owner of the buffer, with the same or different + * access permissions. + * + * @param[in] access Access permissions requested for the new owner. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is unable to + * acquire the resources required by the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is + * not a valid access value. + */ +hsa_status_t HSA_API hsa_memory_assign_agent(void *ptr, hsa_agent_t agent, + hsa_access_permission_t access); + +/** + * + * @brief Register a global, fine-grained buffer. + * + * @details Registering a buffer serves as an indication to the HSA runtime that + * the memory might be accessed from a kernel agent other than the + * host. Registration is a performance hint that allows the HSA runtime + * implementation to know which buffers will be accessed by some of the kernel + * agents ahead of time. + * + * Registration is only recommended for buffers in the global segment that have + * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS + * allocator instead. + * + * Registrations should not overlap. + * + * @param[in] ptr A buffer in global memory. 
If a NULL pointer is passed, no + * operation is performed. + * + * @param[in] size Requested registration size in bytes. A size of 0 is + * only allowed if @p ptr is NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr + * is not NULL. + */ +hsa_status_t HSA_API hsa_memory_register(void *ptr, size_t size); + +/** + * + * @brief Deregister memory previously registered using ::hsa_memory_register. + * + * @details If the memory interval being deregistered does not match a previous + * registration (start and end addresses), the behavior is undefined. + * + * @param[in] ptr A pointer to the base of the buffer to be deregistered. If + * a NULL pointer is passed, no operation is performed. + * + * @param[in] size Size of the buffer to be deregistered. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_memory_deregister(void *ptr, size_t size); + +/** @} */ + +/** \defgroup symbol-attributes Symbol Attributes + * @{ + */ + +/** + * @brief Symbol type. + */ +typedef enum { + /** + * Variable. + */ + HSA_SYMBOL_KIND_VARIABLE = 0, + /** + * Kernel. + */ + HSA_SYMBOL_KIND_KERNEL = 1, + /** + * Indirect function. + */ + HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2 +} hsa_symbol_kind_t; + +/** + * @brief Allocation type of a variable. + */ +typedef enum { + /** + * Agent allocation. + */ + HSA_VARIABLE_ALLOCATION_AGENT = 0, + /** + * Program allocation. + */ + HSA_VARIABLE_ALLOCATION_PROGRAM = 1 +} hsa_variable_allocation_t; + +/** + * @brief Linkage type of a symbol. 
+ */ +typedef enum { + /** + * Module linkage. + */ + HSA_SYMBOL_LINKAGE_MODULE = 0, + /** + * Program linkage. + */ + HSA_SYMBOL_LINKAGE_PROGRAM = 1 +} hsa_symbol_linkage_t; + +/** + * @brief Memory segment associated with a variable. + */ +typedef enum { + /** + * Global memory segment. + */ + HSA_VARIABLE_SEGMENT_GLOBAL = 0, + /** + * Readonly memory segment. + */ + HSA_VARIABLE_SEGMENT_READONLY = 1 +} hsa_variable_segment_t; + +/** @} */ + +/** \defgroup code-object Code Object + * @{ + */ + +/** + * @brief Instruction set architecture. + */ +typedef struct hsa_isa_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_isa_t; + +/** + * @brief Retrieve a reference to an ISA handle out of a symbolic name. + * + * @param[in] name Vendor-specific name associated with a particular instruction + * set architecture. Must be a NUL-terminated string. + * + * @param[out] isa Memory location where the HSA runtime stores the ISA handle + * corresponding to the given name. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is + * NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not + * correspond to any instruction set architecture. + */ +hsa_status_t HSA_API hsa_isa_from_name( + const char* name, + hsa_isa_t* isa); + +/** + * @brief Instruction set architecture attributes. + */ +typedef enum { + /** + * The length of the ISA name. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is character array + * with the length equal to the value of ::HSA_ISA_INFO_NAME_LENGTH attribute. + */ + HSA_ISA_INFO_NAME = 1, + /** + * Number of call conventions supported by the instruction set architecture. + * The type of this attribute is uint32_t. 
+ */ + HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2, + /** + * Number of work-items in a wavefront for a given call convention. Must be a + * power of 2 in the range [1,256]. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3, + /** + * Number of wavefronts per compute unit for a given call convention. In + * practice, other factors (for example, the amount of group memory used by a + * work-group) may further limit the number of wavefronts per compute + * unit. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4 +} hsa_isa_info_t; + +/** + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[in] index Call convention index. Used only for call convention + * attributes, otherwise ignored. Must have a value between 0 (inclusive) and + * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (not + * inclusive) in @p isa. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_INDEX @p index out of range. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is NULL. 
+ */ +hsa_status_t HSA_API hsa_isa_get_info( + hsa_isa_t isa, + hsa_isa_info_t attribute, + uint32_t index, + void* value); + +/** + * @brief Check if the instruction set architecture of a code object can be + * executed on an agent associated with another architecture. + * + * @param[in] code_object_isa Instruction set architecture associated with a + * code object. + * + * @param[in] agent_isa Instruction set architecture associated with an agent. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. If the two architectures are compatible, the result + * is true; if they are incompatible, the result is false. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa are + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_isa_compatible( + hsa_isa_t code_object_isa, + hsa_isa_t agent_isa, + bool* result); + +/** + * @brief An opaque handle to a code object, which contains ISA for finalized + * kernels and indirect functions together with information about the + * global/readonly segment variables they reference. + */ +typedef struct hsa_code_object_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_code_object_t; + +/** + * @brief Opaque handle to application data that is passed to the serialization + * and deserialization functions. + */ +typedef struct hsa_callback_data_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_callback_data_t; + +/** + * @brief Serialize a code object. Can be used for offline finalization, + * install-time finalization, disk code caching, etc. + * + * @param[in] code_object Code object. + * + * @param[in] alloc_callback Callback function for memory allocation. Must not + * be NULL. 
The HSA runtime passes three arguments to the callback: the + * allocation size, the application data, and a pointer to a memory location + * where the application stores the allocation result. The HSA runtime invokes + * @p alloc_callback once to allocate a buffer that contains the serialized + * version of @p code_object. If the callback returns a status code other than + * ::HSA_STATUS_SUCCESS, this function returns the same code. + * + * @param[in] callback_data Application data that is passed to @p + * alloc_callback. May be NULL. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[out] serialized_code_object Memory location where the HSA runtime + * stores a pointer to the serialized code object. Must not be NULL. + * + * @param[out] serialized_code_object_size Memory location where the HSA runtime + * stores the size (in bytes) of @p serialized_code_object. The returned value + * matches the allocation size passed by the HSA runtime to @p + * alloc_callback. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p + * serialized_code_object, or @p serialized_code_object_size are NULL. + */ +hsa_status_t HSA_API hsa_code_object_serialize( + hsa_code_object_t code_object, + hsa_status_t (*alloc_callback)(size_t size, hsa_callback_data_t data, void **address), + hsa_callback_data_t callback_data, + const char *options, + void **serialized_code_object, + size_t *serialized_code_object_size); + +/** + * @brief Deserialize a code object. + * + * @param[in] serialized_code_object A serialized code object. Must not be NULL. 
+ * + * @param[in] serialized_code_object_size The size (in bytes) of @p + * serialized_code_object. Must not be 0. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[out] code_object Memory location where the HSA runtime stores the + * deserialized code object. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object, or @p + * code_object are NULL. @p serialized_code_object_size is 0. + */ +hsa_status_t HSA_API hsa_code_object_deserialize( + void *serialized_code_object, + size_t serialized_code_object_size, + const char *options, + hsa_code_object_t *code_object); + +/** + * @brief Destroy a code object. + * + * @details The lifetime of a code object must exceed that of any executable + * where it has been loaded. If an executable that loaded @p code_object has not + * been destroyed, the behavior is undefined. + * + * @param[in] code_object Code object. The handle becomes invalid after it has + * been destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + */ +hsa_status_t HSA_API hsa_code_object_destroy( + hsa_code_object_t code_object); + +/** + * @brief Code object type. + */ +typedef enum { + /** + * Produces code object that contains ISA for all kernels and indirect + * functions in HSA source. + */ + HSA_CODE_OBJECT_TYPE_PROGRAM = 0 +} hsa_code_object_type_t; + +/** + * @brief Code object attributes. + */ +typedef enum { + /** + * The version of the code object. 
The type of this attribute is a + * NUL-terminated char[64]. If the version of the code object uses less than + * 63 characters, the rest of the array must be filled with NULs. + */ + HSA_CODE_OBJECT_INFO_VERSION = 0, + /** + * Type of code object. The type of this attribute is + * ::hsa_code_object_type_t. + */ + HSA_CODE_OBJECT_INFO_TYPE = 1, + /** + * Instruction set architecture this code object is produced for. The type of + * this attribute is ::hsa_isa_t. + */ + HSA_CODE_OBJECT_INFO_ISA = 2, + /** + * Machine model this code object is produced for. The type of this attribute + * is ::hsa_machine_model_t. + */ + HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3, + /** + * Profile this code object is produced for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_CODE_OBJECT_INFO_PROFILE = 4, + /** + * Default floating-point rounding mode used when the code object is + * produced. The type of this attribute is + * ::hsa_default_float_rounding_mode_t. + */ + HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5 +} hsa_code_object_info_t; + +/** + * @brief Get the current value of an attribute for a given code object. + * + * @param[in] code_object Code object. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code object attribute, or @p value is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. 
+ */ +hsa_status_t HSA_API hsa_code_object_get_info( + hsa_code_object_t code_object, + hsa_code_object_info_t attribute, + void *value); + +/** + * @brief Code object symbol. + */ +typedef struct hsa_code_symbol_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_code_symbol_t; + +/** + * @brief Get the symbol handle within a code object for a given symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API hsa_code_object_get_symbol( + hsa_code_object_t code_object, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @brief Code object symbol attributes. + */ +typedef enum { + /** + * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_CODE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_CODE_SYMBOL_INFO_NAME = 2, + /** + * The length of the module name to which this symbol belongs if this symbol + * has module linkage, otherwise 0 is returned. The type of this attribute is + * uint32_t.
+ */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise empty string is returned. The type of this attribute is + * character array with the length equal to the value of + * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_CODE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * The allocation kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * The segment kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * Alignment of the variable. The value of this attribute is undefined if the + * symbol is not a variable. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * Size of the variable. The value of this attribute is undefined if the + * symbol is not a variable. The type of this attribute is uint32_t. + * + * A size of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. 
The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. + * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true, + * the kernel may use more private memory than the reported value, and the + * application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. 
+ */ + HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function. The type of this + * attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 +} hsa_code_symbol_info_t; + +/** + * @brief Get the current value of an attribute for a given code symbol. + * + * @param[in] code_symbol Code symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_code_symbol_get_info( + hsa_code_symbol_t code_symbol, + hsa_code_symbol_info_t attribute, + void *value); + +/** + * @brief Iterate over the symbols in a code object, and invoke an + * application-defined callback on every iteration. + * + * @param[in] code_object Code object. + * + * @param[in] callback Callback to be invoked once per code object symbol. The + * HSA runtime passes three arguments to the callback: the code object, a + * symbol, and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_code_object_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_code_object_iterate_symbols( + hsa_code_object_t code_object, + hsa_status_t (*callback)(hsa_code_object_t code_object, hsa_code_symbol_t symbol, void* data), + void* data); + +/** @} */ + +/** \defgroup executable Executable + * @{ + */ + +/** + * @brief An opaque handle to an executable, which contains ISA for finalized + * kernels and indirect functions together with the allocated global/readonly + * segment variables they reference. + */ +typedef struct hsa_executable_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_executable_t; + +/** + * @brief Executable state. + */ +typedef enum { + /** + * Executable state, which allows the user to load code objects and define + * external variables. Variable addresses, kernel code handles, and + * indirect function code handles are not available in query operations until + * the executable is frozen (zero always returned). + */ + HSA_EXECUTABLE_STATE_UNFROZEN = 0, + /** + * Executable state, which allows the user to query variable addresses, + * kernel code handles, and indirect function code handles using query + * operation. Loading new code objects, as well as defining external variables + * is not allowed in this state. + */ + HSA_EXECUTABLE_STATE_FROZEN = 1 +} hsa_executable_state_t; + +/** + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] executable_state Executable state. If the state is + * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no + * code objects can be loaded, and no variables can be defined. + * + * @param[in] options Vendor-specific options. May be NULL. 
+ * + * @param[out] executable Memory location where the HSA runtime stores newly + * created executable handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API hsa_executable_create( + hsa_profile_t profile, + hsa_executable_state_t executable_state, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Destroy an executable. + * + * @details Executable handle becomes invalid after the executable has been + * destroyed. Code object handles that were loaded into this executable are + * still valid after the executable has been destroyed, and can be used as + * intended. Resources allocated outside and associated with this executable + * (such as external global/readonly variables) can be released after the + * executable has been destroyed. + * + * Executable should not be destroyed while kernels are in flight. + * + * @param[in] executable Executable. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + */ +hsa_status_t HSA_API hsa_executable_destroy( + hsa_executable_t executable); + +/** + * @brief Load code object into the executable. + * + * @details Every global/readonly variable that is external must be defined + * using define set of operations before loading code objects. Internal + * global/readonly variable is allocated once the code object, that is being + * loaded, references this variable and this variable is not allocated. 
+ * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent to load code object for. The agent must support the + * default floating-point rounding mode used by @p code_object. + * + * @param[in] code_object Code object to load. The lifetime of the code object + * must exceed that of the executable: if @p code_object is destroyed before @p + * executable, the behavior is undefined. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible + * with @p code_object (for example, @p agent does not support the default + * floating-point rounding mode specified by @p code_object), or @p code_object + * is not compatible with @p executable (for example, @p code_object and @p + * executable have different machine models or profiles). + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + */ +hsa_status_t HSA_API hsa_executable_load_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options); + +/** + * @brief Freeze the executable. 
+ * + * @details No modifications to executable can be made after freezing: no + * code objects can be loaded to the executable, no external variables can + * be defined. Freezing the executable does not prevent querying executable's + * attributes. + * + * @param[in] executable Executable. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are + * undefined in the executable. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen. + */ +hsa_status_t HSA_API hsa_executable_freeze( + hsa_executable_t executable, + const char *options); + +/** + * @brief Executable attributes. + */ +typedef enum { + /** + * Profile this executable is created for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_EXECUTABLE_INFO_PROFILE = 1, + /** + * Executable state. The type of this attribute is ::hsa_executable_state_t. + */ + HSA_EXECUTABLE_INFO_STATE = 2 +} hsa_executable_info_t; + +/** + * @brief Get the current value of an attribute for a given executable. + * + * @param[in] executable Executable. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_executable_get_info( + hsa_executable_t executable, + hsa_executable_info_t attribute, + void *value); + +/** + * @brief Define an external global variable with program allocation. + * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with program allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. + * + * @param[in] variable_name Name of the variable. + * + * @param[in] address Address where the variable is defined. The buffer pointed + * by @p address is owned by the application, and cannot be deallocated before + * @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + */ +hsa_status_t HSA_API hsa_executable_global_variable_define( + hsa_executable_t executable, + const char *variable_name, + void *address); + +/** + * @brief Define an external global variable with agent allocation. 
+ * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with agent allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. + * + * @param[in] address Address where the variable is defined. The buffer pointed + * by @p address is owned by the application, and cannot be deallocated before + * @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + */ +hsa_status_t HSA_API hsa_executable_agent_global_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Define an external readonly variable. + * + * @details This function allows the application to provide the definition + * of a variable in the readonly segment memory. The variable must be defined + * before loading a code object into an executable. In addition, code objects + * loaded must not define the variable. 
+ * + * @param[in] executable Executable. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. + * + * @param[in] address Address where the variable is defined. The buffer pointed + * by @p address is owned by the application, and cannot be deallocated before + * @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE Executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + */ +hsa_status_t HSA_API hsa_executable_readonly_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Validate executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see HSA Programming Reference Manual for compatibility rules). + * + * @param[in] executable Executable. + * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable is valid, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate( + hsa_executable_t executable, + uint32_t* result); + +/** + * @brief Executable symbol. + */ +typedef struct hsa_executable_symbol_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_executable_symbol_t; + +/** + * @brief Get the symbol handle for a given symbol name. + * + * @param[in] executable Executable. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[in] agent Agent associated with the symbol. If the symbol is + * independent of any agent (for example, a variable with program + * allocation), this argument is ignored. + * + * @param[in] call_convention Call convention associated with the symbol. If the + * symbol does not correspond to an indirect function, this argument is ignored. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API hsa_executable_get_symbol( + hsa_executable_t executable, + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention, + hsa_executable_symbol_t *symbol); + +/** + * @brief Executable symbol attributes.
+ */ +typedef enum { + /** + * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH + * attribute + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2, + /** + * The length of the module name to which this symbol belongs if this symbol + * has module linkage, otherwise 0 is returned. The type of this attribute is + * uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise empty string is returned. The type of this attribute is + * character array with the length equal to the value of + * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * Agent associated with this symbol. If the symbol is a variable, the + * value of this attribute is only defined if + * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is + * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20, + /** + * The address of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint64_t. + * + * If executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is + * returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. 
+ */ + HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * The allocation kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * The segment kind of the variable. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * Alignment of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * Size of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint32_t. + * + * A value of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Kernel object handle, used in the kernel dispatch packet. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint64_t. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is uint32_t. 
+ */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. + * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is + * true, the kernel may use more private memory than the reported value, and + * the application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * Indirect function object handle. 
The value of this attribute is undefined + * if the symbol is not an indirect function, or the associated agent does + * not support the Full Profile. The type of this attribute depends on the + * machine model: if machine model is small, then the type is uint32_t, if + * machine model is large, then the type is uint64_t. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23, + /** + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function, or the associated + * agent does not support the Full Profile. The type of this attribute is + * uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 +} hsa_executable_symbol_info_t; + +/** + * @brief Get the current value of an attribute for a given executable symbol. + * + * @param[in] executable_symbol Executable symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_executable_symbol_get_info( + hsa_executable_symbol_t executable_symbol, + hsa_executable_symbol_info_t attribute, + void *value); + +/** + * @brief Iterate over the symbols in an executable, and invoke an + * application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol.
The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_executable_iterate_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t executable, hsa_executable_symbol_t symbol, void* data), + void* data); + +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/runtime/hsa-runtime/inc/hsa_api_trace.h b/runtime/hsa-runtime/inc/hsa_api_trace.h new file mode 100644 index 0000000000..5bfba0c1cd --- /dev/null +++ b/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -0,0 +1,177 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_H + +#include "hsa.h" +#ifdef AMD_INTERNAL_BUILD +#include "hsa_ext_image.h" +#include "hsa_ext_amd.h" +#include "hsa_ext_finalize.h" +#else +#include "inc/hsa_ext_image.h" +#include "inc/hsa_ext_amd.h" +#include "inc/hsa_ext_finalize.h" +#endif + +struct ExtTable { + decltype(hsa_ext_program_create)* hsa_ext_program_create_fn; + decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn; + decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn; + decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn; + decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn; + decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn; + decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn; + decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn; + decltype(hsa_ext_image_create)* hsa_ext_image_create_fn; + decltype(hsa_ext_image_import)* hsa_ext_image_import_fn; + decltype(hsa_ext_image_export)* hsa_ext_image_export_fn; + decltype(hsa_ext_image_copy)* hsa_ext_image_copy_fn; + decltype(hsa_ext_image_clear)* hsa_ext_image_clear_fn; + decltype(hsa_ext_image_destroy)* hsa_ext_image_destroy_fn; + decltype(hsa_ext_sampler_create)* hsa_ext_sampler_create_fn; + decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn; +}; + +struct ApiTable { + decltype(hsa_init)* hsa_init_fn; + decltype(hsa_shut_down)* hsa_shut_down_fn; + decltype(hsa_system_get_info)* hsa_system_get_info_fn; + decltype(hsa_system_extension_supported)* hsa_system_extension_supported_fn; + decltype(hsa_system_get_extension_table)* hsa_system_get_extension_table_fn; + decltype(hsa_iterate_agents)* hsa_iterate_agents_fn; + decltype(hsa_agent_get_info)* hsa_agent_get_info_fn; + decltype(hsa_queue_create)* hsa_queue_create_fn; + decltype(hsa_soft_queue_create)* 
hsa_soft_queue_create_fn; + decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + decltype(hsa_queue_inactivate)* hsa_queue_inactivate_fn; + decltype(hsa_queue_load_read_index_acquire)* hsa_queue_load_read_index_acquire_fn; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + decltype(hsa_queue_load_write_index_acquire)* hsa_queue_load_write_index_acquire_fn; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; + decltype(hsa_queue_store_write_index_release)* hsa_queue_store_write_index_release_fn; + decltype(hsa_queue_cas_write_index_acq_rel)* hsa_queue_cas_write_index_acq_rel_fn; + decltype(hsa_queue_cas_write_index_acquire)* hsa_queue_cas_write_index_acquire_fn; + decltype(hsa_queue_cas_write_index_relaxed)* hsa_queue_cas_write_index_relaxed_fn; + decltype(hsa_queue_cas_write_index_release)* hsa_queue_cas_write_index_release_fn; + decltype(hsa_queue_add_write_index_acq_rel)* hsa_queue_add_write_index_acq_rel_fn; + decltype(hsa_queue_add_write_index_acquire)* hsa_queue_add_write_index_acquire_fn; + decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn; + decltype(hsa_queue_add_write_index_release)* hsa_queue_add_write_index_release_fn; + decltype(hsa_queue_store_read_index_relaxed)* hsa_queue_store_read_index_relaxed_fn; + decltype(hsa_queue_store_read_index_release)* hsa_queue_store_read_index_release_fn; + decltype(hsa_agent_iterate_regions)* hsa_agent_iterate_regions_fn; + decltype(hsa_region_get_info)* hsa_region_get_info_fn; + decltype(hsa_agent_get_exception_policies)* hsa_agent_get_exception_policies_fn; + decltype(hsa_agent_extension_supported)* hsa_agent_extension_supported_fn; + decltype(hsa_memory_register)* hsa_memory_register_fn; + decltype(hsa_memory_deregister)* hsa_memory_deregister_fn; + decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; + 
decltype(hsa_memory_free)* hsa_memory_free_fn; + decltype(hsa_memory_copy)* hsa_memory_copy_fn; + decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; + decltype(hsa_signal_create)* hsa_signal_create_fn; + decltype(hsa_signal_destroy)* hsa_signal_destroy_fn; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed_fn; + decltype(hsa_signal_load_acquire)* hsa_signal_load_acquire_fn; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; + decltype(hsa_signal_store_release)* hsa_signal_store_release_fn; + decltype(hsa_signal_wait_relaxed)* hsa_signal_wait_relaxed_fn; + decltype(hsa_signal_wait_acquire)* hsa_signal_wait_acquire_fn; + decltype(hsa_signal_and_relaxed)* hsa_signal_and_relaxed_fn; + decltype(hsa_signal_and_acquire)* hsa_signal_and_acquire_fn; + decltype(hsa_signal_and_release)* hsa_signal_and_release_fn; + decltype(hsa_signal_and_acq_rel)* hsa_signal_and_acq_rel_fn; + decltype(hsa_signal_or_relaxed)* hsa_signal_or_relaxed_fn; + decltype(hsa_signal_or_acquire)* hsa_signal_or_acquire_fn; + decltype(hsa_signal_or_release)* hsa_signal_or_release_fn; + decltype(hsa_signal_or_acq_rel)* hsa_signal_or_acq_rel_fn; + decltype(hsa_signal_xor_relaxed)* hsa_signal_xor_relaxed_fn; + decltype(hsa_signal_xor_acquire)* hsa_signal_xor_acquire_fn; + decltype(hsa_signal_xor_release)* hsa_signal_xor_release_fn; + decltype(hsa_signal_xor_acq_rel)* hsa_signal_xor_acq_rel_fn; + decltype(hsa_signal_exchange_relaxed)* hsa_signal_exchange_relaxed_fn; + decltype(hsa_signal_exchange_acquire)* hsa_signal_exchange_acquire_fn; + decltype(hsa_signal_exchange_release)* hsa_signal_exchange_release_fn; + decltype(hsa_signal_exchange_acq_rel)* hsa_signal_exchange_acq_rel_fn; + decltype(hsa_signal_add_relaxed)* hsa_signal_add_relaxed_fn; + decltype(hsa_signal_add_acquire)* hsa_signal_add_acquire_fn; + decltype(hsa_signal_add_release)* hsa_signal_add_release_fn; + decltype(hsa_signal_add_acq_rel)* hsa_signal_add_acq_rel_fn; + decltype(hsa_signal_subtract_relaxed)* 
hsa_signal_subtract_relaxed_fn; + decltype(hsa_signal_subtract_acquire)* hsa_signal_subtract_acquire_fn; + decltype(hsa_signal_subtract_release)* hsa_signal_subtract_release_fn; + decltype(hsa_signal_subtract_acq_rel)* hsa_signal_subtract_acq_rel_fn; + decltype(hsa_signal_cas_relaxed)* hsa_signal_cas_relaxed_fn; + decltype(hsa_signal_cas_acquire)* hsa_signal_cas_acquire_fn; + decltype(hsa_signal_cas_release)* hsa_signal_cas_release_fn; + decltype(hsa_signal_cas_acq_rel)* hsa_signal_cas_acq_rel_fn; + decltype(hsa_isa_from_name)* hsa_isa_from_name_fn; + decltype(hsa_isa_get_info)* hsa_isa_get_info_fn; + decltype(hsa_isa_compatible)* hsa_isa_compatible_fn; + decltype(hsa_code_object_serialize)* hsa_code_object_serialize_fn; + decltype(hsa_code_object_deserialize)* hsa_code_object_deserialize_fn; + decltype(hsa_code_object_destroy)* hsa_code_object_destroy_fn; + decltype(hsa_code_object_get_info)* hsa_code_object_get_info_fn; + decltype(hsa_code_object_get_symbol)* hsa_code_object_get_symbol_fn; + decltype(hsa_code_symbol_get_info)* hsa_code_symbol_get_info_fn; + decltype(hsa_code_object_iterate_symbols)* hsa_code_object_iterate_symbols_fn; + decltype(hsa_executable_create)* hsa_executable_create_fn; + decltype(hsa_executable_destroy)* hsa_executable_destroy_fn; + decltype(hsa_executable_load_code_object)* hsa_executable_load_code_object_fn; + decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; + decltype(hsa_executable_get_info)* hsa_executable_get_info_fn; + decltype(hsa_executable_global_variable_define)* hsa_executable_global_variable_define_fn; + decltype(hsa_executable_agent_global_variable_define)* hsa_executable_agent_global_variable_define_fn; + decltype(hsa_executable_readonly_variable_define)* hsa_executable_readonly_variable_define_fn; + decltype(hsa_executable_validate)* hsa_executable_validate_fn; + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol_fn; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn; + 
decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn; + decltype(hsa_status_string)* hsa_status_string_fn; + + ExtTable* std_exts_; +}; + +#endif diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h new file mode 100644 index 0000000000..bb32b05a0d --- /dev/null +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -0,0 +1,1183 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension. + +#ifndef HSA_RUNTIME_EXT_AMD_H_ +#define HSA_RUNTIME_EXT_AMD_H_ + +#include "hsa.h" +#include "hsa_ext_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Enumeration constants added to ::hsa_status_t. + * + * @remark Additions to hsa_status_t + */ +enum { + /** + * The memory pool is invalid. + */ + HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40 +}; + +/** + * @brief Agent attributes. + */ +typedef enum hsa_amd_agent_info_s { + /** + * Chip identifier. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000, + /** + * Size of a cacheline in bytes. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001, + /** + * The number of compute unit available in the agent. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002, + /** + * The maximum clock frequency of the agent in MHz. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003, + /** + * Internal driver node identifier. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004, + /** + * Max number of watch points on memory address ranges to generate exception + * events when the watched addresses are accessed. 
+ */ + HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005, + /** + * Agent BDF_ID, named LocationID in thunk. The type of this attribute is + * uint16_t. + */ + HSA_AMD_AGENT_INFO_BDFID = 0xA006 +} hsa_amd_agent_info_t; + +/** + * @brief Region attributes. + */ +typedef enum hsa_amd_region_info_s { + /** + * Determine if host can access the region. The type of this attribute + * is bool. + */ + HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000, + /** + * Base address of the region in flat address space. + */ + HSA_AMD_REGION_INFO_BASE = 0xA001, + /** + * Memory Interface width, the return value type is uint32_t + */ + HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002, + /** + * Max Memory Clock, the return value type is uint32_t + */ + HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003 +} hsa_amd_region_info_t; + +/** + * @brief Coherency attributes of fine grain region. + */ +typedef enum hsa_amd_coherency_type_s { + /** + * Coherent region. + */ + HSA_AMD_COHERENCY_TYPE_COHERENT = 0, + /** + * Non coherent region. + */ + HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1 +} hsa_amd_coherency_type_t; + +/** + * @brief Get the coherency type of the fine grain region of an agent. + * + * @param[in] agent A valid agent. + * + * @param[out] type Pointer to a memory location where the HSA runtime will + * store the coherency type of the fine grain region. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is NULL. + */ +hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent, + hsa_amd_coherency_type_t* type); + +/** + * @brief Set the coherency type of the fine grain region of an agent. + * Deprecated. This is supported on KV platforms. For backward compatibility + * other platforms will spuriously succeed. 
+ * + * @param[in] agent A valid agent. + * + * @param[in] type The coherency type to be set. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is invalid. + */ +hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent, + hsa_amd_coherency_type_t type); + +/** + * @brief Structure containing profiling dispatch time information. + * + * Times are reported as ticks in the domain of the HSA system clock. + * The HSA system clock tick and frequency is obtained via hsa_system_get_info. + */ +typedef struct hsa_amd_profiling_dispatch_time_s { + /** + * Dispatch packet processing start time. + */ + uint64_t start; + /** + * Dispatch packet completion time. + */ + uint64_t end; +} hsa_amd_profiling_dispatch_time_t; + +/** + * @brief Enable or disable profiling capability of a queue. + * + * @param[in] queue A valid queue. + * + * @param[in] enable 1 to enable profiling. 0 to disable profiling. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API + hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable); + +/** + * @brief Retrieve packet processing time stamps. + * + * @param[in] agent The agent with which the signal was last used. For instance, + * if the profiled dispatch packet is dispatched on to queue Q, which was + * created on agent A, then this parameter must be A. + * + * @param[in] signal A signal used as the completion signal of the dispatch + * packet to retrieve time stamps from. 
This dispatch packet must have been + * issued to a queue with profiling enabled and have already completed. Also + * the signal must not have yet been used in any other packet following the + * completion of the profiled dispatch packet. + * + * @param[out] time Packet processing timestamps in the HSA system clock + * domain. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL. + */ +hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( + hsa_agent_t agent, hsa_signal_t signal, + hsa_amd_profiling_dispatch_time_t* time); + +/** + * @brief Computes the frequency ratio and offset between the agent clock and + * HSA system clock and converts the agent’s tick to HSA system domain tick. + * + * @param[in] agent The agent used to retrieve the agent_tick. It is user's + * responsibility to make sure the tick number is from this agent, otherwise, + * the behavior is undefined. + * + * @param[in] agent_tick The tick count retrieved from the specified @p agent. + * + * @param[out] system_tick The translated HSA system domain clock counter tick. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p system_tick is NULL. + */ +hsa_status_t HSA_API + hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent, + uint64_t agent_tick, + uint64_t* system_tick); + +/** + * @brief Asynchronous signal handler function type.
+ * + * @details Type definition of callback function to be used with + * hsa_amd_signal_async_handler. This callback is invoked if the associated + * signal and condition are met. The callback receives the value of the signal + * which satisfied the associated wait condition and a user provided value. If + * the callback returns true then the callback will be called again if the + * associated signal and condition are satisfied again. If the callback returns + * false then it will not be called again. + * + * @param[in] value Contains the value of the signal observed by + * hsa_amd_signal_async_handler which caused the signal handler to be invoked. + * + * @param[in] arg Contains the user provided value given when the signal handler + * was registered with hsa_amd_signal_async_handler + * + * @retval true resumes monitoring the signal with this handler (as if calling + * hsa_amd_signal_async_handler again with identical parameters) + * + * @retval false stops monitoring the signal with this handler (handler will + * not be called again for this signal) + * + */ +typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg); + +/** + * @brief Register asynchronous signal handler function. + * + * @details Allows registering a callback function and user provided value with + * a signal and wait condition. The callback will be invoked if the associated + * signal and wait condition are satisfied. Callbacks will be invoked serially + * but in an arbitrary order so callbacks should be independent of each other. + * After being invoked a callback may continue to wait for its associated signal + * and condition and, possibly, be invoked again. Or the callback may stop + * waiting. If the callback returns true then it will continue waiting and may + * be called again. If false then the callback will not wait again and will not + * be called again for the associated signal and condition. 
It is possible to + * register the same callback multiple times with the same or different signals + * and/or conditions. Each registration of the callback will be treated entirely + * independently. + * + * @param[in] signal hsa signal to be asynchronously monitored + * + * @param[in] cond condition value to monitor for + * + * @param[in] value signal value used in condition expression + * + * @param[in] handler asynchronous signal handler invoked when signal's + * condition is met + * + * @param[in] arg user provided value which is provided to handler when handler + * is invoked + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT handler is invalid (NULL) + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of + * resources or blocking signals are not supported by the HSA driver component. + * + */ +hsa_status_t HSA_API + hsa_amd_signal_async_handler(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg); + +/** + * @brief Call a function asynchronously + * + * @details Provides access to the runtime's asynchronous event handling thread + * for general asynchronous functions. Functions queued this way are executed + * in the same manner as if they were a signal handler whose signal is + * satisfied. + * + * @param[in] callback asynchronous function to be invoked + * + * @param[in] arg user provided value which is provided to callback when callback + * is invoked + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT callback is invalid (NULL) + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of + * resources or blocking signals are not supported by the HSA driver component. + * + */ +hsa_status_t HSA_API + hsa_amd_async_function(void (*callback)(void* arg), void* arg); + +/** + * @brief Wait for any signal-condition pair to be satisfied. + * + * @details Allows waiting for any of several signal-condition pairs to be + * satisfied. The function returns the index into the list of signals of the + * first satisfying signal-condition pair. The value of the satisfying signal + * is returned in satisfying_value unless satisfying_value is NULL. This + * function provides only relaxed memory semantics. + */ +uint32_t HSA_API + hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout_hint, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value); + +/** + * @brief Query image limits. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute HSA image info attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p value is NULL or @p attribute < + * HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS or @p attribute > + * HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS.
+ * + */ +hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +/** + * @brief Set a CU affinity to specific queues within the process, this function + * call is "atomic". + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in. + * + * @param[in] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not + * multiple of 32 or @p cu_mask is NULL. + * + * @retval ::HSA_STATUS_ERROR failed to call thunk api + * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask); + +/** + * @brief Memory segments associated with a memory pool. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_AMD_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_AMD_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_AMD_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_AMD_SEGMENT_GROUP = 3, +} hsa_amd_segment_t; + +/** + * @brief A memory pool represents physical storage on an agent. + */ +typedef struct hsa_amd_memory_pool_s { + /** + * Opaque handle. 
+ */ + uint64_t handle; +} hsa_amd_memory_pool_t; + +typedef enum hsa_amd_memory_pool_global_flag_s { + /** + * The application can use allocations in the memory pool to store kernel + * arguments, and provide the values for the kernarg segment of + * a kernel dispatch. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1, + /** + * Updates to memory in this pool conform to HSA memory consistency model. + * If this flag is set, then ::HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED + * must not be set. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Writes to memory in this pool can be performed by a single agent at a time. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4 +} hsa_amd_memory_pool_global_flag_t; + +/** + * @brief Memory pool features. + */ +typedef enum { + /** + * Segment where the memory pool resides. The type of this attribute is + * ::hsa_amd_segment_t. + */ + HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not ::HSA_AMD_SEGMENT_GLOBAL. The type + * of + * this attribute is uint32_t, a bit-field of + * ::hsa_amd_memory_pool_global_flag_t + * values. + */ + HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this pool, in bytes. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_SIZE = 2, + /** + * Indicates whether memory in this pool can be allocated using + * ::hsa_amd_memory_pool_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for memory pools in the group and + * private segments. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by + * ::hsa_amd_memory_pool_allocate + * in this memory pool. The size of a buffer allocated in this pool is a + * multiple of the value of this attribute. 
The value of this attribute is + * only defined if ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for + * this pool. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_amd_memory_pool_allocate in this + * pool. The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and + * must be a power of 2. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7, + /** + * This memory_pool can be made directly accessible by all the agents in the + * system (::hsa_amd_agent_memory_pool_get_info returns + * ::HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT for all agents). The type of + * this attribute is bool. + */ + HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15, +} hsa_amd_memory_pool_info_t; + +/** + * @brief Get the current value of an attribute of a memory pool. + * + * @param[in] memory_pool A valid memory pool. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, + hsa_amd_memory_pool_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory pools associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @details An agent can directly access buffers located in some memory pool, or + * be enabled to access them by the application (see ::hsa_amd_agents_allow_access), + * yet that memory pool may not be returned by this function for that given + * agent.
+ * + * A memory pool of fine-grained type must be associated only with the host. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked on the same thread that called + * ::hsa_amd_agent_iterate_memory_pools, serially, once per memory pool that is + * associated with the agent. The HSA runtime passes two arguments to the + * callback: the memory pool, and the application data. If @p callback + * returns a status other than ::HSA_STATUS_SUCCESS for a particular iteration, + * the traversal stops and ::hsa_amd_agent_iterate_memory_pools returns that status + * value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data), + void* data); + +/** + * @brief Allocate a block of memory (or buffer) in the specified pool. + * + * @param[in] memory_pool Memory pool where to allocate memory from. The memory + * pool must have the ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool. + * + * @param[in] flags A bit-field that is used to specify allocation + * directives. Must be 0. + * + * @param[out] ptr Pointer to the location where to store the base virtual + * address of + * the allocated block. 
The returned base address is aligned to the value of + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT in @p memory_pool. If the + * allocation fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The memory pool is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p memory_pool, or @p size is greater than the value of + * HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE in @p memory_pool. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size, + uint32_t flags, void** ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_amd_memory_pool_allocate. + * + * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value + * previously returned by ::hsa_amd_memory_pool_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); + +/** + * @brief Asynchronously copy a block of memory from the location pointed to by + * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p + * dst_agent. + * Because the DMA engines used may not be in the same coherency domain, the caller must ensure + * that buffers are system-level coherent. 
In general this requires the sending device to have + * released the buffer to system scope prior to executing the copy API and the receiving device + * must execute a system scope acquire fence prior to use of the destination buffer. + * + * @param[out] dst Buffer where the content is to be copied. + * + * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly + * access both the source and destination buffers in their current locations. + * + * @param[in] src A valid pointer to the source of data to be copied. The source + * buffer must not overlap with the destination buffer, otherwise the copy will succeed + * but contents of @p dst is undefined. + * + * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly + * access both the source and destination buffers in their current locations. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed by @p dst or @p src results in undefined + * behavior. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the copy + * operation starts. The copy will start after every signal has been observed with + * the value 0. The dependent signal should not include completion signal from hsa_amd_memory_async_copy + * operation to be issued in future as that can result in a deadlock. If @p num_dep_signals is 0, this + * argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the copy + * operation. When the copy operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. The signal handle must not be 0. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. The + * application is responsible for checking for asynchronous error conditions + * (see the description of @p completion_signal). + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL, or the completion signal is 0. + */ +hsa_status_t HSA_API + hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** + * @brief Type of accesses to a memory pool from a given agent. + */ +typedef enum { + /** + * The agent cannot directly access any buffer in the memory pool. + */ + HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0, + /** + * The agent can directly access a buffer located in the pool; the application + * does not need to invoke ::hsa_amd_agents_allow_access. + */ + HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1, + /** + * The agent can directly access a buffer located in the pool, but only if the + * application has previously requested access to that buffer using + * ::hsa_amd_agents_allow_access. + */ + HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2 +} hsa_amd_memory_pool_access_t; + +/** + * @brief Type of bus/link used when accessing a memory pool from an agent. + */ +typedef enum { + /** + * HyperTransport bus type. + */ + HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0, + + /** + * QPI bus type. + */ + HSA_AMD_LINK_INFO_TYPE_QPI = 1, + + /** + * PCIe bus type. + */ + HSA_AMD_LINK_INFO_TYPE_PCIE = 2, + + /** + * InfiniBand bus type.
+ */ + HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3 + +} hsa_amd_link_info_type_t; + +/** + * @brief Link properties when accessing the memory pool from the specified + * agent. + */ +typedef struct hsa_amd_memory_pool_link_info_s { + /** + * Minimum transfer latency (rounded to ns). + */ + uint32_t min_latency; + + /** + * Maximum transfer latency (rounded to ns). + */ + uint32_t max_latency; + + /** + * Minimum link interface bandwidth in MB/s. + */ + uint32_t min_bandwidth; + + /** + * Maximum link interface bandwidth in MB/s. + */ + uint32_t max_bandwidth; + + /** + * Support for 32-bit atomic transactions. + */ + bool atomic_support_32bit; + + /** + * Support for 64-bit atomic transactions. + */ + bool atomic_support_64bit; + + /** + * Support for cache coherent transactions. + */ + bool coherent_support; + + /** + * The type of bus/link. + */ + hsa_amd_link_info_type_t link_type; + +} hsa_amd_memory_pool_link_info_t; + +/** + * @brief Properties of the relationship between an agent and a memory pool. + */ +typedef enum { + /** + * Access to buffers located in the memory pool. The type of this attribute + * is ::hsa_amd_memory_pool_access_t. + * + * An agent can always directly access buffers currently located in a memory + * pool that is associated (the memory_pool is one of the values returned by + * ::hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the + * buffer is currently located in a memory pool that is not associated with + * the agent, and the value returned by this function for the given + * combination of agent and memory pool is not + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to invoke + * ::hsa_amd_agents_allow_access in order to gain direct access to the buffer. + * + * If the given agent can directly access buffers in the pool, the result is not + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED.
If the memory pool is associated with + * the agent, or it is of fine-grained type, the result must not be + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not associated + * with the agent, and does not reside in the global segment, the result must + * be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0, + + /** + * Number of links to hop when accessing the memory pool from the specified + * agent. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1, + + /** + * Details of each link hop when accessing the memory pool starting from the + * specified agent. The type of this attribute is an array of + * HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS elements, each of type + * ::hsa_amd_memory_pool_link_info_t. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2 + +} hsa_amd_agent_memory_pool_info_t; + +/** + * @brief Get the current value of an attribute of the relationship between an + * agent and a memory pool. + * + * @param[in] agent Agent. + * + * @param[in] memory_pool Memory pool. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + */ +hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( + hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, + hsa_amd_agent_memory_pool_info_t attribute, void* value); + +/** + * @brief Enable direct access to a buffer from a given set of agents. + * + * @details + * + * Upon return, only the listed agents and the agent associated with the + * buffer's memory pool have direct access to the @p ptr.
+ * + * Any agent that has access to the buffer before and after the call to + * ::hsa_amd_agents_allow_access will also have access while + * ::hsa_amd_agents_allow_access is in progress. + * + * The caller is responsible for ensuring that each agent in the list + * is able to access the memory pool containing @p ptr + * (using ::hsa_amd_agent_memory_pool_get_info with ::HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS attribute), + * otherwise an error code is returned. + * + * @param[in] num_agents Size of @p agents. + * + * @param[in] agents List of agents. If @p num_agents is 0, this argument is + * ignored. + * + * @param[in] flags A list of bit-fields used to specify access + * information on a per-agent basis. The size of this list must match that of @p + * agents. Must be NULL. + * + * @param[in] ptr A buffer previously allocated using ::hsa_amd_memory_pool_allocate. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_agents is 0, or @p agents + * is NULL, @p flags is not NULL, or access cannot be enabled for the agent(s) + * because @p ptr is allocated from an inaccessible pool. + * + */ +hsa_status_t HSA_API + hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents, + const uint32_t* flags, const void* ptr); + +/** + * @brief Query if buffers currently located in some memory pool can be + * relocated to a destination memory pool. + * + * @details If the returned value is non-zero, a migration of a buffer to @p + * dst_memory_pool using ::hsa_amd_memory_migrate may nevertheless fail due to + * resource limitations. + * + * @param[in] src_memory_pool Source memory pool. + * + * @param[in] dst_memory_pool Destination memory pool. + * + * @param[out] result Pointer to a memory location where the result of the query + * is stored. Must not be NULL.
If buffers currently located in @p + * src_memory_pool can be relocated to @p dst_memory_pool, the result is + * true. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL One of the memory pools is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool, + hsa_amd_memory_pool_t dst_memory_pool, + bool* result); + +/** + * @brief Relocate a buffer to a new memory pool. + * + * @details When a buffer is migrated, its virtual address remains the same but + * its physical contents are moved to the indicated memory pool. + * + * After migration, only the agent associated with the destination pool will have access. + * + * The caller is also responsible for ensuring that the allocation in the + * source memory pool where the buffer is currently located can be migrated to the + * specified destination memory pool (using ::hsa_amd_memory_pool_can_migrate returns a value of true + * for the source and destination memory pools), otherwise behavior is undefined. + * + * The caller must ensure that the buffer is not accessed while it is migrated. + * + * @param[in] ptr Buffer to be relocated. The buffer must have been released to system + * prior to call this API. The buffer will be released to system upon completion. + * + * @param[in] memory_pool Memory pool where to place the buffer. + * + * @param[in] flags A bit-field that is used to specify migration + * information. Must be zero. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The destination memory pool is + * invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p flags is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, + hsa_amd_memory_pool_t memory_pool, + uint32_t flags); + +/** + * + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and return a new + * pointer accessible by the @p agents. If the @p host_ptr overlaps with previously locked + * memory, then the overlap area is kept locked (i.e multiple mappings are permitted). In this case, + * the same input @p host_ptr may give different locked @p agent_ptr and when it does, they + * are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. + * + * @param[in] size The size to be locked. + * + * @param[in] agents Array of agent handle to gain access to the @p host_ptr. + * If this parameter is NULL and the @p num_agent is 0, all agents + * in the platform will gain access to the @p host_ptr. + * + * @param[out] agent_ptr Pointer to the location where to store the new address. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agent in @p agents is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or + * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents + * is NULL but @p num_agent is not 0. 
+ */ + +hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, + hsa_agent_t* agents, int num_agent, + void** agent_ptr); + +/** + * + * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock. + * + * @details The behavior is undefined if the host pointer being unpinned does not + * match a previously pinned address or if the host pointer was already deallocated. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator that was + * pinned previously via ::hsa_amd_memory_lock. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr); + +/** + * @brief Sets the first @p count uint32_t elements of the block of memory pointed to by + * @p ptr to the specified @p value. + * + * @param[in] ptr Pointer to the block of memory to fill. + * + * @param[in] value Value to be set. + * + * @param[in] count Number of uint32_t elements to be set to the value. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or + * not 4-byte aligned. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count); + +/** + * @brief Maps an interop object into the HSA flat address space and establishes + * memory residency. The metadata pointer is valid during the lifetime of the + * map (until hsa_amd_interop_unmap_buffer is called). + * Multiple calls to hsa_amd_interop_map_buffer with the same interop_handle + * result in multiple mappings with potentially different addresses and + * different metadata pointers. Concurrent operations on these addresses are + * not coherent.
Memory must be fenced to system scope to ensure consistency, + * between mappings and with any views of this buffer in the originating + * software stack. + * + * @param[in] num_agents Number of agents which require access to the memory + * + * @param[in] agents List of accessing agents. + * + * @param[in] interop_handle Handle of interop buffer (dmabuf handle in Linux) + * + * @param [in] flags Reserved, must be 0 + * + * @param[out] size Size in bytes of the mapped object + * + * @param[out] ptr Base address of the mapped object + * + * @param[out] metadata_size Size of metadata in bytes, may be NULL + * + * @param[out] metadata Pointer to metadata, may be NULL + * + * @retval HSA_STATUS_SUCCESS if successfully mapped + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT all other errors + */ +hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents, + hsa_agent_t* agents, + int interop_handle, + uint32_t flags, + size_t* size, + void** ptr, + size_t* metadata_size, + const void** metadata); + +/** + * @brief Removes a previously mapped interop object from HSA's flat address space. + * Ends lifetime for the mapping's associated metadata pointer. + */ +hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr); + +/** + * @brief Encodes an opaque vendor specific image format. The length of data + * depends on the underlying format. This structure must not be copied as its + * true length can not be determined. + */ +typedef struct hsa_amd_image_descriptor_s { + /* + Version number of the descriptor + */ + uint32_t version; + + /* + Vendor and device PCI IDs for the format as VENDOR_ID<<16|DEVICE_ID. + */ + uint32_t deviceID; + + /* + Start of vendor specific data. 
+ */ + uint32_t data[0]; +} hsa_amd_image_descriptor_t; + +/** + * @brief Creates an image from an opaque vendor specific image format. + * Does not modify data at image_data. Intended initially for + * accessing interop images. + * + * @param[in] agent Agent on which to create the image + * + * @param[in] image_descriptor Vendor specific image format + * + * @param[in] image_data Pointer to image backing store + * + * @param[in] access_permission Access permissions for the image object + * + * @param[out] image Created image object. + * + * @retval HSA_STATUS_SUCCESS Image created successfully + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT Bad or mismatched descriptor, + * null image_data, or mismatched access_permission. + */ +hsa_status_t HSA_API hsa_amd_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const hsa_amd_image_descriptor_t *image_layout, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image +); + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/runtime/hsa-runtime/inc/hsa_ext_finalize.h b/runtime/hsa-runtime/inc/hsa_ext_finalize.h new file mode 100644 index 0000000000..1aeb92d0bb --- /dev/null +++ b/runtime/hsa-runtime/inc/hsa_ext_finalize.h @@ -0,0 +1,531 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ +#define HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_FINALIZER +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct BrigModuleHeader; +typedef struct BrigModuleHeader* BrigModule_t; + +/** \defgroup ext-alt-finalizer-extensions Finalization Extensions + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + */ +enum { + /** + * The HSAIL program is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_PROGRAM = 0x2000, + /** + * The HSAIL module is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_MODULE = 0x2001, + /** + * Machine model or profile of the HSAIL module do not match the machine model + * or profile of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE = 0x2002, + /** + * The HSAIL module is already a part of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED = 0x2003, + /** + * Compatibility mismatch between symbol declaration and symbol definition. + */ + HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH = 0x2004, + /** + * The finalization encountered an error while finalizing a kernel or + * indirect function. + */ + HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED = 0x2005, + /** + * Mismatch between a directive in the control directive structure and in + * the HSAIL kernel. + */ + HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH = 0x2006 +}; + +/** @} */ + +/** \defgroup ext-alt-finalizer-program Finalization Program + * @{ + */ + +/** + * @brief HSAIL (BRIG) module. The HSA Programmer's Reference Manual contains + * the definition of the BrigModule_t type. 
+ */ +typedef BrigModule_t hsa_ext_module_t; + +/** + * @brief An opaque handle to a HSAIL program, which groups a set of HSAIL + * modules that collectively define functions and variables used by kernels and + * indirect functions. + */ +typedef struct hsa_ext_program_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_ext_program_t; + +/** + * @brief Create an empty HSAIL program. + * + * @param[in] machine_model Machine model used in the HSAIL program. + * + * @param[in] profile Profile used in the HSAIL program. + * + * @param[in] default_float_rounding_mode Default float rounding mode used in + * the HSAIL program. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[out] program Memory location where the HSA runtime stores the newly + * created HSAIL program handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p machine_model is invalid, + * @p profile is invalid, @p default_float_rounding_mode is invalid, or + * @p program is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_create( + hsa_machine_model_t machine_model, + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_ext_program_t *program); + +/** + * @brief Destroy a HSAIL program. + * + * @details The HSAIL program handle becomes invalid after it has been + * destroyed. Code object handles produced by ::hsa_ext_program_finalize are + * still valid after the HSAIL program has been destroyed, and can be used as + * intended. 
Resources allocated outside and associated with the HSAIL program + * (such as HSAIL modules that are added to the HSAIL program) can be released + * after the finalization program has been destroyed. + * + * @param[in] program HSAIL program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + */ +hsa_status_t HSA_API hsa_ext_program_destroy( + hsa_ext_program_t program); + +/** + * @brief Add a HSAIL module to an existing HSAIL program. + * + * @details The HSA runtime does not perform a deep copy of the HSAIL module + * upon addition. Instead, it stores a pointer to the HSAIL module. The + * ownership of the HSAIL module belongs to the application, which must ensure + * that @p module is not released before destroying the HSAIL program. + * + * The HSAIL module is successfully added to the HSAIL program if @p module is + * valid, if all the declarations and definitions for the same symbol are + * compatible, and if @p module specify machine model and profile that matches + * the HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] module HSAIL module. The application can add the same HSAIL module + * to @p program at most once. The HSAIL module must specify the same machine + * model and profile as @p program. If the floating-mode rounding mode of @p + * module is not default, then it should match that of @p program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. 
+ * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_MODULE The HSAIL module is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE The machine model of @p + * module does not match machine model of @p program, or the profile of @p + * module does not match profile of @p program. + * + * @retval ::HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED The HSAIL module is + * already a part of the HSAIL program. + * + * @retval ::HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol declaration and symbol + * definition compatibility mismatch. See the symbol compatibility rules in the + * HSA Programming Reference Manual. + */ +hsa_status_t HSA_API hsa_ext_program_add_module( + hsa_ext_program_t program, + hsa_ext_module_t module); + +/** + * @brief Iterate over the HSAIL modules in a program, and invoke an + * application-defined callback on every iteration. + * + * @param[in] program HSAIL program. + * + * @param[in] callback Callback to be invoked once per HSAIL module in the + * program. The HSA runtime passes three arguments to the callback: the program, + * a HSAIL module, and the application data. If @p callback returns a status + * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal + * stops and ::hsa_ext_program_iterate_modules returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_iterate_modules( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module, + void* data), + void* data); + +/** + * @brief HSAIL program attributes. 
+ */ +typedef enum { + /** + * Machine model specified when the HSAIL program was created. The type + * of this attribute is ::hsa_machine_model_t. + */ + HSA_EXT_PROGRAM_INFO_MACHINE_MODEL = 0, + /** + * Profile specified when the HSAIL program was created. The type of + * this attribute is ::hsa_profile_t. + */ + HSA_EXT_PROGRAM_INFO_PROFILE = 1, + /** + * Default float rounding mode specified when the HSAIL program was + * created. The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXT_PROGRAM_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 2 +} hsa_ext_program_info_t; + +/** + * @brief Get the current value of an attribute for a given HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behaviour is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * HSAIL program attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_get_info( + hsa_ext_program_t program, + hsa_ext_program_info_t attribute, + void *value); + +/** + * @brief Finalizer-determined call convention. + */ +typedef enum { + /** + * Finalizer-determined call convention. + */ + HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO = -1 +} hsa_ext_finalizer_call_convention_t; + +/** + * @brief Control directives specify low-level information about the + * finalization process. + */ +typedef struct hsa_ext_control_directives_s { + /** + * Bitset indicating which control directives are enabled. 
The bit assigned to + * a control directive is determined by the corresponding value in + * BrigControlDirective. + * + * If a control directive is disabled, its corresponding field value (if any) + * must be 0. Control directives that are only present or absent (such as + * partial workgroups) have no corresponding field as the presence of the bit + * in this mask is sufficient. + */ + uint64_t control_directives_mask; + /** + * Bitset of HSAIL exceptions that must have the BREAK policy enabled. The bit + * assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains an enablebreakexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t break_exceptions_mask; + /** + * Bitset of HSAIL exceptions that must have the DETECT policy enabled. The + * bit assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains an enabledetectexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t detect_exceptions_mask; + /** + * Maximum size (in bytes) of dynamic group memory that will be allocated by + * the application for any dispatch of the kernel. If the kernel contains a + * maxdynamicsize control directive, the two values should match. + */ + uint32_t max_dynamic_group_size; + /** + * Maximum number of grid work-items that will be used by the application to + * launch the kernel. If the kernel contains a maxflatgridsize control + * directive, the value of @a max_flat_grid_size must not be greater than the + * value of the directive, and takes precedence. + * + * The value specified for maximum absolute grid size must be greater than or + * equal to the product of the values specified by @a required_grid_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATGRIDSIZE is set in @a + * control_directives_mask, this field must be greater than 0.
+ */ + uint64_t max_flat_grid_size; + /** + * Maximum number of work-group work-items that will be used by the + * application to launch the kernel. If the kernel contains a + * maxflatworkgroupsize control directive, the value of @a + * max_flat_workgroup_size must not be greater than the value of the + * directive, and takes precedence. + * + * The value specified for maximum absolute work-group size must be greater than or + * equal to the product of the values specified by @a required_workgroup_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATWORKGROUPSIZE is set in @a + * control_directives_mask, this field must be greater than 0. + */ + uint32_t max_flat_workgroup_size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Grid size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredgridsize control directive, the + * dimensions should match. + * + * The specified grid size must be consistent with @a required_workgroup_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_grid_size. Note that the listed invariants must hold + * only if all the corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDGRIDSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + uint64_t required_grid_size[3]; + /** + * Work-group size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredworkgroupsize control directive, + * the dimensions should match. + * + * The specified work-group size must be consistent with @a required_grid_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_workgroup_size. Note that the listed invariants must + * hold only if all the corresponding control directives are enabled.
+ * + * If the bit at position BRIG_CONTROL_REQUIREDWORKGROUPSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + hsa_dim3_t required_workgroup_size; + /** + * Number of dimensions that will be used by the application to launch the + * kernel. If the kernel contains a requireddim control directive, the two + * values should match. + * + * The specified dimensions must be consistent with @a required_grid_size and + * @a required_workgroup_size. This invariant must hold only if all the + * corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDDIM is set in @a + * control_directives_mask, this field must be 1, 2, or 3. + */ + uint8_t required_dim; + /** + * Reserved. Must be 0. + */ + uint8_t reserved2[75]; +} hsa_ext_control_directives_t; + +/** + * @brief Finalize an HSAIL program for a given instruction set architecture. + * + * @details Finalize all of the kernels and indirect functions that belong to + * the same HSAIL program for a specific instruction set architecture (ISA). The + * transitive closure of all functions specified by call or scall must be + * defined. Kernels and indirect functions that are being finalized must be + * defined. Kernels and indirect functions that are referenced in kernels and + * indirect functions being finalized may or may not be defined, but must be + * declared. All the global/readonly segment variables that are referenced in + * kernels and indirect functions being finalized may or may not be defined, but + * must be declared. + * + * @param[in] program HSAIL program. + * + * @param[in] isa Instruction set architecture to finalize for. + * + * @param[in] call_convention A call convention used in a finalization. Must + * have a value between ::HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO (inclusive) + * and the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT in @p + * isa (not inclusive). 
+ * + * @param[in] control_directives Low-level control directives that influence + * the finalization process. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[in] code_object_type Type of code object to produce. + * + * @param[out] code_object Code object generated by the Finalizer, which + * contains the machine code for the kernels and indirect functions in the HSAIL + * program. The code object is independent of the HSAIL module that was used to + * generate it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p isa is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH The directive in + * the control directive structure and in the HSAIL kernel mismatch, or if the + * same directive is used with a different value in one of the functions used by + * this kernel. + * + * @retval ::HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED The Finalizer + * encountered an error while compiling a kernel or an indirect function. 
+ */ +hsa_status_t HSA_API hsa_ext_program_finalize( + hsa_ext_program_t program, + hsa_isa_t isa, + int32_t call_convention, + hsa_ext_control_directives_t control_directives, + const char *options, + hsa_code_object_type_t code_object_type, + hsa_code_object_t *code_object); + +/** @} */ + +#define hsa_ext_finalizer_1_00 + +typedef struct hsa_ext_finalizer_1_00_pfn_s { + hsa_status_t (*hsa_ext_program_create)( + hsa_machine_model_t machine_model, hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, hsa_ext_program_t *program); + + hsa_status_t (*hsa_ext_program_destroy)(hsa_ext_program_t program); + + hsa_status_t (*hsa_ext_program_add_module)(hsa_ext_program_t program, + hsa_ext_module_t module); + + hsa_status_t (*hsa_ext_program_iterate_modules)( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, + hsa_ext_module_t module, void *data), + void *data); + + hsa_status_t (*hsa_ext_program_get_info)( + hsa_ext_program_t program, hsa_ext_program_info_t attribute, + void *value); + + hsa_status_t (*hsa_ext_program_finalize)( + hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention, + hsa_ext_control_directives_t control_directives, const char *options, + hsa_code_object_type_t code_object_type, hsa_code_object_t *code_object); +} hsa_ext_finalizer_1_00_pfn_t; + +#ifdef __cplusplus +} // extern "C" block +#endif // __cplusplus + +#endif // HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ diff --git a/runtime/hsa-runtime/inc/hsa_ext_image.h b/runtime/hsa-runtime/inc/hsa_ext_image.h new file mode 100644 index 0000000000..4bc9999e46 --- /dev/null +++ b/runtime/hsa-runtime/inc/hsa_ext_image.h @@ -0,0 +1,964 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. 
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_EXT_IMAGE_H +#define HSA_EXT_IMAGE_H + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_IMAGES +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +/** \defgroup ext-images Images and Samplers + * @{ + */ + +/** + * @brief Image handle, populated by ::hsa_ext_image_create. Images + * handles are only unique within an agent, not across agents. + * + */ +typedef struct hsa_ext_image_s { + /** + * Opaque handle. + */ + uint64_t handle; + +} hsa_ext_image_t; + +/** + * @brief Geometry associated with the HSA image (image dimensions allowed in + * HSA). The enumeration values match the BRIG type BrigImageGeometry. + */ +typedef enum { + /** + * One-dimensional image addressed by width coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1D = 0, + + /** + * Two-dimensional image addressed by width and height coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2D = 1, + + /** + * Three-dimensional image addressed by width, height, and depth coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_3D = 2, + + /** + * Array of one-dimensional images with the same size and format. 1D arrays + * are addressed by index and width coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1DA = 3, + + /** + * Array of two-dimensional images with the same size and format. 2D arrays + * are addressed by index and width and height coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DA = 4, + + /** + * One-dimensional image interpreted as a buffer with specific restrictions. + */ + HSA_EXT_IMAGE_GEOMETRY_1DB = 5, + + /** + * Two-dimensional depth image addressed by width and height coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DDEPTH = 6, + + /** + * Array of two-dimensional depth images with the same size and format. 2D + * arrays are addressed by index and width and height coordinates. 
+ */ + HSA_EXT_IMAGE_GEOMETRY_2DADEPTH = 7 +} hsa_ext_image_geometry_t; + +/** + * @brief Channel type associated with the elements of an image. See the Image + * section in the HSA Programming Reference Manual for definitions on each + * component type. The enumeration values match the BRIG type + * BrigImageChannelType. + */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15 +} hsa_ext_image_channel_type_t; + +/** + * + * @brief Channel order associated with the elements of an image. See the + * Image section in the HSA Programming Reference Manual for definitions on each + * component order. The enumeration values match the BRIG type + * BrigImageChannelOrder. 
+ */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0, + HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1, + HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2, + HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4, + HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8, + HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9, + HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10, + HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14, + HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15, + HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 +} hsa_ext_image_channel_order_t; + +/** + * @brief Image format. + */ +typedef struct hsa_ext_image_format_s { + /** + * Channel type. + */ + hsa_ext_image_channel_type_t channel_type; + + /** + * Channel order. + */ + hsa_ext_image_channel_order_t channel_order; +} hsa_ext_image_format_t; + +/** + * @brief Implementation-independent image descriptor. + */ +typedef struct hsa_ext_image_descriptor_s { + /** + * Image geometry. + */ + hsa_ext_image_geometry_t geometry; + /** + * Width of the image, in components. + */ + size_t width; + /** + * Height of the image, in components. Only defined if the geometry is 2D or + * higher. + */ + size_t height; + /** + * Depth of the image, in components. Only defined if @a geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D. A depth of 0 is the same as a depth of 1. + */ + size_t depth; + /** + * Number of images in the image array. Only defined if @a geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH. + */ + size_t array_size; + /** + * Image format.
+ */ + hsa_ext_image_format_t format; +} hsa_ext_image_descriptor_t; + +/** + * @brief Image capability. + */ +typedef enum { + /** + * Images of this geometry and format are not supported in the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED = 0x0, + /** + * Read-only images of this geometry and format are supported by the + * agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_ONLY = 0x1, + /** + * Write-only images of this geometry and format are supported by the + * agent. + */ + HSA_EXT_IMAGE_CAPABILITY_WRITE_ONLY = 0x2, + /** + * Read-write images of this geometry and format are supported by the + * agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_WRITE = 0x4, + /** + * Images of this geometry and format can be accessed from read-modify-write + * operations in the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_MODIFY_WRITE = 0x8, + /** + * Images of this geometry and format are guaranteed to have a consistent + * data layout regardless of how they are accessed by the associated + * agent. + */ + HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT = 0x10 +} hsa_ext_image_capability_t; + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, image format and geometry. + * + * @param[in] agent Agent to be associated with the image. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. + * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p geometry is not a valid image + * geometry value, @p image_format is NULL, or @p capability_mask is NULL. + */ +hsa_status_t HSA_API + hsa_ext_image_get_capability(hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + +/** + * @brief Agent-specific image size and alignment requirements, populated by + * ::hsa_ext_image_data_get_info. + */ +typedef struct hsa_ext_image_data_info_s { + /** + * Image data size, in bytes. + */ + size_t size; + + /** + * Image data alignment, in bytes. + */ + size_t alignment; + +} hsa_ext_image_data_info_t; + +/** + * @brief Retrieve the image data requirements for a given combination of image + * descriptor, access permission, and agent. + * + * @details The optimal image data size and alignment requirements may vary + * depending on the image attributes specified in @p image_descriptor. Also, + * different implementation of the HSA runtime may return different requirements + * for the same input values. + * + * The implementation must return the same image data requirements for different + * access permissions with exactly the same image descriptor as long as + * ::hsa_ext_image_get_capability reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT for the geometry + * and image format contained in the image descriptor. + * + * @param[in] agent Agent to be associated with the image. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Image access mode for @a agent. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does + * not support the image format specified by the descriptor. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent does + * not support the image dimensions specified by the image descriptor. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * access_permission is not a valid access permission value, or @p + * image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info( + hsa_agent_t agent, const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Creates an agent-defined image handle from an + * implementation-independent image descriptor and agent-specific image + * data. + * + * @details Images created with different access permissions but the same image + * descriptor can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported by + * ::hsa_ext_image_get_capability for the image format specified in the image + * descriptor. Images with an s-form channel order can share the same image data + * with other images that have the corresponding non-s-form channel order, + * provided the rest of their image descriptors are identical. + * + * If necessary, an application can use image operations (import, export, copy, + * clear) to prepare the image for the intended use regardless of the access + * permissions. + * + * @param[in] agent Agent to be associated with the image. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info. Must not be NULL.
+ * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image by the + * agent. The access permission defines how the agent expects to use the + * image and must match the corresponding HSAIL image handle type. The agent + * must support the image format specified in @p image_descriptor for the given + * permission. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does + * not have the capability to support the image format contained in the image + * descriptor using the specified access permission. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime cannot create the + * image because it is out of resources (for example, the agent does not + * support the creation of more image handles with the given access permission). + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p access_permission is not a valid access permission + * value, or @p image is NULL. + */ +hsa_status_t HSA_API + hsa_ext_image_create(hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + +/** + * @brief Destroy an image previously created using ::hsa_ext_image_create. + * + * @details Destroying the image handle does not free the associated image data, + * or modify its contents. 
The application should not destroy an image while + * there are references to it queued for execution or currently being used in a + * kernel. + * + * @param[in] agent Agent associated with the image. + * + * @param[in] image Image. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API + hsa_ext_image_destroy(hsa_agent_t agent, hsa_ext_image_t image); + +/** + * @brief Copies a portion of one image (the source) to another image (the + * destination). + * + * @details The source and destination image formats should match, except if the + * channel order of one of the images is the standard form of the channel order of + * the other image. For example, it is allowed to copy a source image with a + * channel order of HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB to a destination image with + * a channel order of HSA_EXT_IMAGE_CHANNEL_ORDER_RGB. + * + * The source and destination images do not have to be of the same geometry and + * appropriate scaling is performed by the HSA runtime. It is possible to copy + * subregions between any combinations of source and destination types, provided + * that the dimensions of the subregions are the same. For example, it is + * allowed to copy a rectangular region from a 2D image to a slice of a 3D + * image. + * + * If the source and destination image data overlap, or the combination of + * offset and range references an out-of-bounds element in any of the images, + * the behavior is undefined. + * + * @param[in] agent Agent associated with both images. + * + * @param[in] src_image Source image. The agent associated with the source + * image must be identical to that of the destination image. + * + * @param[in] src_offset Pointer to the offset within the source image where to + * copy the data from. Must not be NULL.
+ * + * @param[in] dst_image Destination image. + * + * @param[in] dst_offset Pointer to the offset within the destination + * image where to copy the data. Must not be NULL. + * + * @param[in] range Dimensions of the image portion to be copied. The HSA + * runtime computes the size of the image data to be copied using this + * argument. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_offset is + * NULL, @p dst_offset is NULL, or @p range is NULL. + */ +hsa_status_t HSA_API + hsa_ext_image_copy(hsa_agent_t agent, hsa_ext_image_t src_image, + const hsa_dim3_t *src_offset, hsa_ext_image_t dst_image, + const hsa_dim3_t *dst_offset, const hsa_dim3_t *range); + +/** + * @brief Image region. + */ +typedef struct hsa_ext_image_region_s { + /** + * Offset within an image (in coordinates). + */ + hsa_dim3_t offset; + + /** + * Dimensions of the image range (in coordinates). The x, y, and z dimensions + * correspond to width, height, and depth respectively. + */ + hsa_dim3_t range; +} hsa_ext_image_region_t; + +/** + * @brief Import a linearly organized image data from memory directly to an + * image handle. + * + * @details This operation updates the image data referenced by the image handle + * from the source memory. The size of the data imported from memory is + * implicitly derived from the image region. + * + * If @p src_row_pitch is smaller than the destination region width (in bytes), + * then @p src_row_pitch = region width. + * + * If @p src_slice_pitch is smaller than the destination region width * region + * height (in bytes), then @p src_slice_pitch = region width * region height. + * + * It is the application's responsibility to avoid out of bounds memory access. 
+ * + * None of the source memory or image data memory in the previously created + * ::hsa_ext_image_create image handle can overlap. Overlapping of any + * of the source and destination memory within the import operation produces + * undefined results. + * + * @param[in] agent Agent associated with the image. + * + * @param[in] src_memory Source memory. Must not be NULL. + * + * @param[in] src_row_pitch Number of bytes in one row of the source memory. + * + * @param[in] src_slice_pitch Number of bytes in one slice of the source memory. + * + * @param[in] dst_image Destination image. + * + * @param[in] image_region Pointer to the image region to be updated. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_memory is NULL, or @p + * image_region is NULL. + * + */ +hsa_status_t HSA_API + hsa_ext_image_import(hsa_agent_t agent, const void *src_memory, + size_t src_row_pitch, size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Export the image data to linearly organized memory. + * + * @details The operation updates the destination memory with the image data of + * @p src_image. The size of the data exported to memory is implicitly derived + * from the image region. + * + * If @p dst_row_pitch is smaller than the source region width (in bytes), then + * @p dst_row_pitch = region width. + * + * If @p dst_slice_pitch is smaller than the source region width * region height + * (in bytes), then @p dst_slice_pitch = region width * region height. + * + * It is the application's responsibility to avoid out of bounds memory access. 
+ * + * None of the destination memory or image data memory in the previously created + * ::hsa_ext_image_create image handle can overlap. Overlapping of any of + * the source and destination memory within the export operation produces + * undefined results. + * + * @param[in] agent Agent associated with the image. + * + * @param[in] src_image Source image. + * + * @param[in] dst_memory Destination memory. Must not be NULL. + * + * @param[in] dst_row_pitch Number of bytes in one row of the destination + * memory. + * + * @param[in] dst_slice_pitch Number of bytes in one slice of the destination + * memory. + * + * @param[in] image_region Pointer to the image region to be exported. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p dst_memory is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API + hsa_ext_image_export(hsa_agent_t agent, hsa_ext_image_t src_image, + void *dst_memory, size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Clear an image to the specified value. + * + * @details Clearing an image does not perform any format conversion and the + * provided clear data is directly stored regardless of the image format. The + * lowest bits of the data (number of bits depending on the image component + * type) stored in the cleared image are based on the image component order. + * + * The number of elements in @p data should match the number of access + * components for the channel order of @p image, as determined by the HSA + * Programmer's Reference Manual. 
A single element is required for + * HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH and + * HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL, while any other channel order + * requires 4 elements. + * + * Each element in @p data is a 32-bit value. The type of each element + * should match the access type associated with the channel type of @p image, + * as determined by the HSA Programmer's Reference Manual: + * - HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8, + * HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16, and + * HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 map to int32_t. + * - HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, + * HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, and + * HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 map to uint32_t. + * - Any other channel type maps to a 32-bit float. + * + * @param[in] agent Agent associated with the image. + * + * @param[in] image Image to be cleared. + * + * @param[in] data Clear value array. Specifying a clear value outside of the + * range that can be represented by an image format results in undefined + * behavior. Must not be NULL. + * + * @param[in] image_region Pointer to the image region to clear. Must not be + * NULL. If the region references an out-of-bounds element, the behavior is + * undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p data is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API + hsa_ext_image_clear(hsa_agent_t agent, hsa_ext_image_t image, + const void *data, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Sampler handle. Samplers are populated by + * ::hsa_ext_sampler_create. Sampler handles are only unique within an + * agent, not across agents. + */ +typedef struct hsa_ext_sampler_s { + /** + * Opaque handle.
+ */ + uint64_t handle; +} hsa_ext_sampler_t; + +/** + * @brief Sampler address modes. The sampler address mode describes the + * processing of out-of-range image coordinates. The values match the BRIG + * type BrigSamplerAddressing. + */ +typedef enum { + /** + * Out-of-range coordinates are not handled. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED = 0, + + /** + * Clamp out-of-range coordinates to the image edge. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE = 1, + + /** + * Clamp out-of-range coordinates to the image border. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER = 2, + + /** + * Wrap out-of-range coordinates back into the valid coordinate range. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT = 3, + + /** + * Mirror out-of-range coordinates back into the valid coordinate range. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT = 4 + +} hsa_ext_sampler_addressing_mode_t; + +/** + * @brief Sampler coordinate modes. The enumeration values match the BRIG + * BRIG_SAMPLER_COORD bit in BrigSamplerModifier. + */ +typedef enum { + /** + * Coordinates are all in the range of 0 to (dimension-1). + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED = 0, + + /** + * Coordinates are all in the range of 0.0 to 1.0. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED = 1 + +} hsa_ext_sampler_coordinate_mode_t; + +/** + * @brief Sampler filter modes. The enumeration values match the BRIG type + * BrigSamplerFilter. + */ +typedef enum { + /** + * Filter to the image element nearest (in Manhattan distance) to the + * specified coordinate. + */ + HSA_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, + + /** + * Filter to the image element calculated by combining the elements in a 2x2 + * square block or 2x2x2 cube block around the specified coordinate. The + * elements are combined using linear interpolation. + */ + HSA_EXT_SAMPLER_FILTER_MODE_LINEAR = 1 + +} hsa_ext_sampler_filter_mode_t; + +/** + * @brief Implementation-independent sampler descriptor. 
+ */ +typedef struct hsa_ext_sampler_descriptor_s { + /** + * Sampler coordinate mode describes the normalization of image coordinates. + */ + hsa_ext_sampler_coordinate_mode_t coordinate_mode; + + /** + * Sampler filter type describes the type of sampling performed. + */ + hsa_ext_sampler_filter_mode_t filter_mode; + + /** + * Sampler address mode describes the processing of out-of-range image + * coordinates. + */ + hsa_ext_sampler_addressing_mode_t address_mode; + +} hsa_ext_sampler_descriptor_t; + +/** + * @brief Create a kernel agent defined sampler handle for a given combination + * of a (agent-independent) sampler descriptor and agent. + * + * @param[in] agent Agent to be associated with the sampler. + * + * @param[in] sampler_descriptor Pointer to a sampler descriptor. Must not be + * NULL. + * + * @param[out] sampler Memory location where the HSA runtime stores the newly + * created sampler handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The agent cannot create the + * specified handle because it is out of resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or + * @p sampler is NULL. + */ +hsa_status_t HSA_API hsa_ext_sampler_create( + hsa_agent_t agent, const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +/** + * @brief Destroy a sampler previously created using ::hsa_ext_sampler_create. + * + * @param[in] agent Agent associated with the sampler. + * + * @param[in] sampler Sampler. The sampler handle should not be destroyed while + * there are references to it queued for execution or currently being used in a + * dispatch. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API + hsa_ext_sampler_destroy(hsa_agent_t agent, hsa_ext_sampler_t sampler); + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + */ +enum { + /** + * Image format is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED = 0x3000, + /** + * Image size is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED = 0x3001 +}; + +/** + * @brief Enumeration constants added to ::hsa_agent_info_t by this + * extension. The value of any of these attributes is undefined if the + * agent is not a kernel agent, or the implementation does not support images. + */ +enum { + /** + * Maximum number of elements in 1D images. Must be at most 16384. The type + * of this attribute is uint32_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS = 0x3000, + /** + * Maximum number of elements in 1DA images. Must be at most 16384. The type + * of this attribute is uint32_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS = 0x3001, + /** + * Maximum number of elements in 1DB images. Must be at most 65536. The type + * of this attribute is uint32_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS = 0x3002, + /** + * Maximum dimensions (width, height) of 2D images, in image elements. The X + * and Y maximums must be at most 16384. The type of this attribute is + * uint32_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS = 0x3003, + /** + * Maximum dimensions (width, height) of 2DA images, in image elements. The X + * and Y maximums must be at most 16384. The type of this attribute is + * uint32_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS = 0x3004, + /** + * Maximum dimensions (width, height) of 2DDEPTH images, in image + * elements. The X and Y maximums must be at most 16384. The type of this + * attribute is uint32_t[2]. 
+ */ + HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS = 0x3005, + /** + * Maximum dimensions (width, height) of 2DADEPTH images, in image + * elements. The X and Y maximums must be at most 16384. The type of this + * attribute is uint32_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS = 0x3006, + /** + * Maximum dimensions (width, height, depth) of 3D images, in image + * elements. The maximum along any dimension cannot exceed 2048. The type of + * this attribute is uint32_t[3]. + */ + HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS = 0x3007, + /** + * Maximum number of image layers in an image array. Must not exceed 2048. The + * type of this attribute is uint32_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS = 0x3008, + /** + * Maximum number of read-only image handles that can be created at any one + * time. Must be at least 128. The type of this attribute is uint32_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES = 0x3009, + /** + * Maximum number of write-only and read-write image handles (combined) that + * can be created at any one time. Must be at least 64. The type of this + * attribute is uint32_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES = 0x300A, + /** + * Maximum number of sampler handles that can be created at any one + * time. Must be at least 16. The type of this attribute is uint32_t.
+ */ + HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS = 0x300B +}; + +/** @} */ + +#define hsa_ext_images_1_00 + +typedef struct hsa_ext_images_1_00_pfn_s { + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)(hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)(hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t *src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t *dst_offset, + const hsa_dim3_t *range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, const void *src_memory, size_t src_row_pitch, + size_t src_slice_pitch, hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, hsa_ext_image_t src_image, void *dst_memory, + size_t dst_row_pitch, size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, hsa_ext_image_t image, const void *data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)(hsa_agent_t agent, + hsa_ext_sampler_t sampler); + +} hsa_ext_images_1_00_pfn_t; + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif diff --git 
a/runtime/hsa-runtime/libamdhsacode/CMakeLists.txt b/runtime/hsa-runtime/libamdhsacode/CMakeLists.txt new file mode 100644 index 0000000000..2b1573044c --- /dev/null +++ b/runtime/hsa-runtime/libamdhsacode/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# amdhsacode library +# +# This file is expected to be included from top-level CMakeLists.txt. +# +# Dependencies: +# - Compiler definitions +# - elf library +# +# Defines: +# - amdhsacode library and target include directories + +file(GLOB sources *.cpp *.hpp) +find_package(LibElf REQUIRED) +add_library(amdhsacode ${sources}) +target_include_directories(amdhsacode PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(amdhsacode elf) diff --git a/runtime/hsa-runtime/libamdhsacode/amd_elf_image.cpp b/runtime/hsa-runtime/libamdhsacode/amd_elf_image.cpp new file mode 100644 index 0000000000..b49a73199d --- /dev/null +++ b/runtime/hsa-runtime/libamdhsacode/amd_elf_image.cpp @@ -0,0 +1,1691 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "amd_elf_image.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "amd_hsa_code_util.hpp" +#ifdef _WIN32 +#include +#define alignof __alignof +#endif // _WIN32 +#include + +#ifndef _WIN32 +#define _open open +#define _close close +#define _read read +#define _write write +#define _lseek lseek +#define _ftruncate ftruncate +#define _tempnam tempnam +#include +#include +#include +#else +#define _ftruncate _chsize +#endif + +#if !defined(BSD_LIBELF) + #define elf_setshstrndx elfx_update_shstrndx +#endif + +#define NOTE_RECORD_ALIGNMENT 4 + +using amd::hsa::alignUp; + +namespace amd { + namespace elf { + + class FileImage { + public: + FileImage(); + ~FileImage(); + bool create(); + bool readFrom(const std::string& filename); + bool copyFrom(const void* data, size_t size); + bool writeTo(const std::string& filename); + bool copyTo(void** buffer, size_t* size = 0); + bool copyTo(void* buffer, size_t size); 
+ size_t getSize(); + + std::string output() { return out.str(); } + + int fd() { return d; } + + private: + int d; + std::ostringstream out; + + bool error(const char* msg); + bool perror(const char *msg); + std::string werror(); + }; + + FileImage::FileImage() + : d(-1) + { + } + + FileImage::~FileImage() + { + if (d >= 0) { amd::hsa::CloseTempFile(d); } + } + + bool FileImage::error(const char* msg) + { + out << "Error: " << msg << std::endl; + return false; + } + + bool FileImage::perror(const char* msg) + { + out << "Error: " << msg << ": " << strerror(errno) << std::endl; + return false; + } + +#ifdef _WIN32 + std::string FileImage::werror() + { + LPVOID lpMsgBuf; + DWORD dw = GetLastError(); + + FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + dw, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPTSTR)&lpMsgBuf, + 0, NULL); + std::string result((LPTSTR)lpMsgBuf); + LocalFree(lpMsgBuf); + return result; + } +#endif // _WIN32 + + bool FileImage::create() + { + d = amd::hsa::OpenTempFile("amdelf"); + if (d < 0) { return error("Failed to open temporary file for elf image"); } + return true; + } + + bool FileImage::readFrom(const std::string& filename) + { +#ifdef _WIN32 + std::unique_ptr buffer(new char[32 * 1024 * 1024]); + HANDLE in = CreateFile(filename.c_str(), GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + if (in == INVALID_HANDLE_VALUE) { out << "Failed to open " << filename << ": " << werror() << std::endl; return false; } + DWORD read; + unsigned write; + int written; + do { + if (!ReadFile(in, buffer.get(), sizeof(buffer), &read, NULL)) { + out << "Failed to read " << filename << ": " << werror() << std::endl; + CloseHandle(in); + return false; + } + if (read > 0) { + write = read; + do { + written = _write(d, buffer.get(), write); + if (written < 0) { + out << "Failed to write image file: " << werror() << std::endl; + CloseHandle(in); + } + write -= 
written; + } while (write > 0); + } + } while (read > 0); + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek(0) failed"); } + CloseHandle(in); + return true; +#else // _WIN32 + int in = _open(filename.c_str(), O_RDONLY); + if (in < 0) { return perror("open failed"); } + if (_lseek(in, 0L, SEEK_END) < 0) { return perror("lseek failed"); } + off_t size; + if ((size = _lseek(in, 0L, SEEK_CUR)) < 0) { return perror("lseek(2) failed"); } + if (_lseek(in, 0L, SEEK_SET) < 0) { return perror("lseek(3) failed"); } + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek(3) failed"); } + ssize_t written; + do { + written = sendfile(d, in, NULL, size); + if (written < 0) { + _close(in); + return perror("sendfile failed"); + } + size -= written; + } while (size > 0); + _close(in); + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek(0) failed"); } + return true; +#endif // _WIN32 + } + + bool FileImage::copyFrom(const void* data, size_t size) + { + assert(d != -1); + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek failed"); } + if (_ftruncate(d, 0) < 0) { return perror("ftruncate failed"); } + int written, offset = 0; + while (size > 0) { + written = _write(d, (const char*) data + offset, size); + if (written < 0) { + return perror("write failed"); + } + size -= written; + offset += written; + } + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek failed"); } + return true; + } + + size_t FileImage::getSize() + { + assert(d != -1); + if (_lseek(d, 0L, SEEK_END) < 0) { return perror("lseek failed"); } + long seek = 0; + if ((seek = _lseek(d, 0L, SEEK_CUR)) < 0) { return perror("lseek(2) failed"); } + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek(3) failed"); } + return seek; + } + + bool FileImage::copyTo(void** buffer, size_t* size) + { + size_t size1 = getSize(); + void* buffer1 = malloc(size1); + if (_read(d, buffer1, size1) < 0) { free(buffer1); return perror("read failed"); } + *buffer = buffer1; + if (size) { *size = size1; } + return 
true; + } + + bool FileImage::copyTo(void* buffer, size_t size) + { + size_t size1 = getSize(); + if (size < size1) { return error("Buffer size is not enough"); } + if (_read(d, buffer, size1) < 0) { return perror("read failed"); } + return true; + } + + bool FileImage::writeTo(const std::string& filename) + { + bool res = false; + size_t size = 0; + void *buffer = nullptr; + if (copyTo(&buffer, &size)) { + res = true; + std::ofstream out(filename.c_str(), std::ios::binary); + out.write((char*)buffer, size); + } + free(buffer); + return res; + } + + class Buffer { + public: + typedef unsigned char byte_type; + typedef size_t size_type; + + Buffer(); + Buffer(const byte_type *src, size_type size, size_type align = 0); + virtual ~Buffer(); + + const byte_type* raw() const + { return this->isConst() ? ptr_ : data_.data(); } + size_type align() const + { return align_; } + size_type size() const + { return this->isConst() ? size_ : data_.size(); } + bool isConst() const + { return 0 != size_; } + bool isEmpty() + { return size() == 0; } + bool hasRaw(const byte_type *src) const + { return (src >= this->raw()) && (src < this->raw() + this->size()); } + template + bool has(const T *src) const + { return this->hasRaw((const byte_type*)src); } + bool has(size_type offset) const + { return offset < this->size(); } + + template + size_type getOffset(const T *src) const + { return this->getRawOffset((const byte_type*)src); } + template + T get(size_type offset) const + { return (T)this->getRaw(offset); } + size_type addString(const std::string &str, size_type align = 0); + size_type addStringLength(const std::string &str, size_type align = 0); + size_type nextOffset(size_type align) const { return alignUp(this->size(), align); } + template + size_type add(const T *src, size_type size, size_type align) + { return this->addRaw((const byte_type*)src, size, align); } + template + size_type add(const T &src, size_type align = 0) + { return this->addRaw((const byte_type*)&src, 
sizeof(T), align == 0 ? alignof(T) : align); } + size_type align(size_type align); + + template + size_type reserve() + { + Buffer::size_type offset = this->align(alignof(T)); + data_.insert(data_.end(), sizeof(T), 0x0); + return offset; + } + + private: + size_type getRawOffset(const byte_type *src) const; + const byte_type* getRaw(size_type offset) const; + size_type addRaw(const byte_type *src, size_type size, size_type align); + + std::vector data_; + const byte_type *ptr_; + size_type size_; + size_type align_; + }; + + Buffer::Buffer() + : ptr_(nullptr) + , size_(0) + , align_(0) + { + } + + Buffer::Buffer(const Buffer::byte_type *src, Buffer::size_type size, Buffer::size_type align) + : ptr_(src) + , size_(size) + , align_(align) + { + } + + Buffer::~Buffer() + { + } + + Buffer::size_type Buffer::getRawOffset(const Buffer::byte_type *src) const + { + assert(this->has(src)); + return src - this->raw(); + } + + const Buffer::byte_type* Buffer::getRaw(Buffer::size_type offset) const + { + assert(this->has(offset)); + return this->raw() + offset; + } + + Buffer::size_type Buffer::addRaw(const Buffer::byte_type *src, Buffer::size_type size, Buffer::size_type align) + { + assert(!this->isConst()); + assert(nullptr != src); + assert(0 != size); + assert(0 != align); + Buffer::size_type offset = this->align(align); + data_.insert(data_.end(), src, src + size); + return offset; + } + + Buffer::size_type Buffer::addString(const std::string &str, size_type align) + { + return this->add(str.c_str(), str.length() + 1, align == 0 ? alignof(char) : align); + } + + Buffer::size_type Buffer::addStringLength(const std::string &str, size_type align) + { + return this->add((uint32_t)(str.length() + 1), align == 0 ? 
alignof(uint32_t) : align); + } + + Buffer::size_type Buffer::align(Buffer::size_type align) + { + assert(!this->isConst()); + assert(0 != align); + Buffer::size_type offset = alignUp(this->size(), align); + align_ = (std::max)(align_, align); + data_.insert(data_.end(), offset - this->size(), 0x0); + return offset; + } + + class GElfImage; + class GElfSegment; + + class GElfSection : public virtual Section { + public: + GElfSection(GElfImage* elf); + + bool push(const char* name, uint32_t shtype, uint64_t shflags, uint16_t shlink, uint32_t info, uint32_t align, uint64_t entsize = 0); + bool pull0(); + bool pull(uint16_t ndx); + virtual bool pullData() { return true; } + bool push(); + uint16_t getSectionIndex() const override; + uint32_t type() const override { return hdr.sh_type; } + std::string Name() const override; + uint64_t offset() const override { return hdr.sh_offset; } + uint64_t addr() const override { return hdr.sh_addr; } + bool updateAddr(uint64_t addr) override; + uint64_t addralign() const override { return data0.size() == 0 ? data.align() : data0.align(); } + uint64_t flags() const override { return hdr.sh_flags; } + uint64_t size() const override { return data0.size() == 0 ? data.size() : data0.size(); } + uint64_t nextDataOffset(uint64_t align) const override; + uint64_t addData(const void *src, uint64_t size, uint64_t align) override; + bool getData(uint64_t offset, void* dest, uint64_t size) override; + bool hasRelocationSection() const override { return reloc_sec != 0; } + RelocationSection* relocationSection(SymbolTable* symtab = 0) override; + Segment* segment() override { return seg; } + RelocationSection* asRelocationSection() override { return 0; } + bool setMemSize(uint64_t s) { memsize_ = s; return true; } + uint64_t memSize() const override { return memsize_ ? memsize_ : size(); } + bool setAlign(uint64_t a) { align_ = a; return true; } + uint64_t memAlign() const override { return align_ ? 
align_ : addralign(); } + + protected: + GElfImage* elf; + Segment* seg; + GElf_Shdr hdr; + Buffer data0, data; + uint64_t memsize_; + uint64_t align_; + RelocationSection *reloc_sec; + + size_t ndxscn; + + friend class GElfSymbol; + friend class GElfSegment; + friend class GElfImage; + }; + + class GElfSegment : public Segment { + public: + GElfSegment(GElfImage* elf, uint16_t index); + GElfSegment(GElfImage* elf, uint16_t index, uint32_t type, uint32_t flags, uint64_t paddr = 0); + bool push(uint64_t vaddr); + bool pull(); + uint64_t type() const override { return phdr.p_type; } + uint64_t memSize() const override { return phdr.p_memsz; } + uint64_t align() const override { return phdr.p_align; } + uint64_t imageSize() const override { return phdr.p_filesz; } + uint64_t vaddr() const override { return phdr.p_vaddr; } + uint64_t flags() const override { return phdr.p_flags; } + const char* data() const override; + uint16_t getSegmentIndex() override; + bool updateAddSection(Section *section) override; + + private: + GElfImage* elf; + uint16_t index; + GElf_Phdr phdr; + std::vector sections; + }; + + class GElfStringTable : public GElfSection, public StringTable { + public: + GElfStringTable(GElfImage* elf); + bool push(const char* name, uint32_t shtype, uint64_t shflags); + bool pullData() override; + const char* addString(const std::string& s) override; + size_t addString1(const std::string& s); + const char* getString(size_t ndx) override; + size_t getStringIndex(const char* name) override; + + uint16_t getSectionIndex() const override { return GElfSection::getSectionIndex(); } + uint32_t type() const override { return GElfSection::type(); } + std::string Name() const override { return GElfSection::Name(); } + uint64_t addr() const override { return GElfSection::addr(); } + uint64_t offset() const override { return GElfSection::offset(); } + bool updateAddr(uint64_t addr) override { return GElfSection::updateAddr(addr); } + uint64_t addralign() const override { 
return GElfSection::addralign(); } + uint64_t flags() const override { return GElfSection::flags(); } + uint64_t size() const override { return GElfSection::size(); } + Segment* segment() override { return GElfSection::segment(); } + uint64_t nextDataOffset(uint64_t align) const override { return GElfSection::nextDataOffset(align); } + uint64_t addData(const void *src, uint64_t size, uint64_t align) override { return GElfSection::addData(src, size, align); } + bool getData(uint64_t offset, void* dest, uint64_t size) override { return GElfSection::getData(offset, dest, size); } + bool hasRelocationSection() const override { return GElfSection::hasRelocationSection(); } + RelocationSection* relocationSection(SymbolTable* symtab) override { return GElfSection::relocationSection(); } + RelocationSection* asRelocationSection() override { return 0; } + uint64_t memSize() const override { return GElfSection::memSize(); } + bool setMemSize(uint64_t s) override { return GElfSection::setMemSize(s); } + uint64_t memAlign() const override { return GElfSection::memAlign(); } + bool setAlign(uint64_t a) override { return GElfSection::setAlign(a); } + }; + + class GElfSymbolTable; + + class GElfSymbol : public Symbol { + public: + GElfSymbol(GElfSymbolTable* symtab, Buffer &data, size_t index); + + bool push(const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, uint16_t shndx, unsigned char other); + + /* Symbol index: eindex is the byte offset of this entry in the symbol data buffer, so divide by the size of one symbol entry (GElf_Sym), not of a relocation entry. */ + uint32_t index() override { return eindex / sizeof(GElf_Sym); } + uint32_t type() override { return GELF_ST_TYPE(Sym()->st_info); } + uint32_t binding() { return GELF_ST_BIND(Sym()->st_info); } + uint64_t size() { return Sym()->st_size; } + uint64_t value() { return Sym()->st_value; } + unsigned char other() { return Sym()->st_other; } + std::string name() override; + Section* section(); + + void setValue(uint64_t value) override { Sym()->st_value = value; } + void setSize(uint64_t size) override { Sym()->st_size = size; } + + private:
+ GElf_Sym* Sym() { return edata.get(eindex); } + GElfSymbolTable* symtab; + Buffer &edata; + size_t eindex; + friend class GElfSymbolTable; + }; + + class GElfSymbolTable : public GElfSection, public SymbolTable { + private: + Symbol* addSymbolInternal(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other = 0); + + GElfStringTable* strtab; + std::vector> symbols; + friend class GElfSymbol; + + public: + GElfSymbolTable(GElfImage* elf); + bool push(const char* name, GElfStringTable* strtab); + bool pullData() override; + uint16_t getSectionIndex() const override { return GElfSection::getSectionIndex(); } + uint32_t type() const override { return GElfSection::type(); } + std::string Name() const override { return GElfSection::Name(); } + uint64_t offset() const override { return GElfSection::offset(); } + uint64_t addr() const override { return GElfSection::addr(); } + bool updateAddr(uint64_t addr) override { return GElfSection::updateAddr(addr); } + uint64_t addralign() const override { return GElfSection::addralign(); } + uint64_t flags() const override { return GElfSection::flags(); } + uint64_t size() const override { return GElfSection::size(); } + Segment* segment() override { return GElfSection::segment(); } + uint64_t nextDataOffset(uint64_t align) const override { return GElfSection::nextDataOffset(align); } + uint64_t addData(const void *src, uint64_t size, uint64_t align) override { return GElfSection::addData(src, size, align); } + bool getData(uint64_t offset, void* dest, uint64_t size) override { return GElfSection::getData(offset, dest, size); } + bool hasRelocationSection() const override { return GElfSection::hasRelocationSection(); } + RelocationSection* relocationSection(SymbolTable* symtab) override { return GElfSection::relocationSection(); } + Symbol* addSymbol(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned 
char binding, unsigned char other = 0) override; + size_t symbolCount() override; + Symbol* symbol(size_t i) override; + RelocationSection* asRelocationSection() override { return 0; } + uint64_t memSize() const override { return GElfSection::memSize(); } + bool setMemSize(uint64_t s) override { return GElfSection::setMemSize(s); } + uint64_t memAlign() const override { return GElfSection::memAlign(); } + bool setAlign(uint64_t a) override { return GElfSection::setAlign(a); } + }; + + class GElfNoteSection : public GElfSection, public NoteSection { + public: + GElfNoteSection(GElfImage* elf); + bool push(const std::string& name); + uint16_t getSectionIndex() const override { return GElfSection::getSectionIndex(); } + uint32_t type() const override { return GElfSection::type(); } + std::string Name() const override { return GElfSection::Name(); } + uint64_t addr() const override { return GElfSection::addr(); } + bool updateAddr(uint64_t addr) override { return GElfSection::updateAddr(addr); } + uint64_t offset() const override { return GElfSection::offset(); } + uint64_t addralign() const override { return GElfSection::addralign(); } + uint64_t flags() const override { return GElfSection::flags(); } + uint64_t size() const override { return GElfSection::size(); } + Segment* segment() override { return GElfSection::segment(); } + uint64_t nextDataOffset(uint64_t align) const override { return GElfSection::nextDataOffset(align); } + uint64_t addData(const void *src, uint64_t size, uint64_t align) override { return GElfSection::addData(src, size, align); } + bool getData(uint64_t offset, void* dest, uint64_t size) override { return GElfSection::getData(offset, dest, size); } + bool hasRelocationSection() const override { return GElfSection::hasRelocationSection(); } + RelocationSection* relocationSection(SymbolTable* symtab) override { return GElfSection::relocationSection(); } + bool addNote(const std::string& name, uint32_t type, const void* desc, uint32_t desc_size) 
override; + bool getNote(const std::string& name, uint32_t type, void** desc, uint32_t* desc_size) override; + RelocationSection* asRelocationSection() override { return 0; } + uint64_t memSize() const override { return GElfSection::memSize(); } + bool setMemSize(uint64_t s) override { return GElfSection::setMemSize(s); } + uint64_t memAlign() const override { return GElfSection::memAlign(); } + bool setAlign(uint64_t a) override { return GElfSection::setAlign(a); } + }; + + class GElfRelocationSection; + + class GElfRelocation : public Relocation { + private: + GElf_Rela *Rela() { return edata.get(eindex); } + + GElfRelocationSection* rsection; + Buffer &edata; + size_t eindex; + + public: + GElfRelocation(GElfRelocationSection* rsection_, Buffer &edata_, size_t eindex_) + : rsection(rsection_), + edata(edata_), eindex(eindex_) + { + } + + bool push(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend); + + RelocationSection* section() override; + uint32_t type() override { return GELF_R_TYPE(Rela()->r_info); } + uint32_t symbolIndex() override { return GELF_R_SYM(Rela()->r_info); } + Symbol* symbol() override; + uint64_t offset() override { return Rela()->r_offset; } + int64_t addend() override { return Rela()->r_addend; } + }; + + class GElfRelocationSection : public GElfSection, public RelocationSection { + private: + Section* section; + GElfSymbolTable* symtab; + std::vector> relocations; + + public: + GElfRelocationSection(GElfImage* elf, Section* targetSection = 0, GElfSymbolTable* symtab_ = 0); + bool push(const std::string& name); + bool pullData() override; + uint16_t getSectionIndex() const override { return GElfSection::getSectionIndex(); } + uint32_t type() const override { return GElfSection::type(); } + std::string Name() const override { return GElfSection::Name(); } + uint64_t addr() const override { return GElfSection::addr(); } + uint64_t offset() const override { return GElfSection::offset(); } + bool updateAddr(uint64_t addr) override 
{ return GElfSection::updateAddr(addr); } + uint64_t addralign() const override { return GElfSection::addralign(); } + uint64_t flags() const override { return GElfSection::flags(); } + uint64_t size() const override { return GElfSection::size(); } + Segment* segment() override { return GElfSection::segment(); } + uint64_t nextDataOffset(uint64_t align) const override { return GElfSection::nextDataOffset(align); } + uint64_t addData(const void *src, uint64_t size, uint64_t align) override { return GElfSection::addData(src, size, align); } + bool getData(uint64_t offset, void* dest, uint64_t size) override { return GElfSection::getData(offset, dest, size); } + bool hasRelocationSection() const override { return GElfSection::hasRelocationSection(); } + RelocationSection* relocationSection(SymbolTable* symtab) override { return GElfSection::relocationSection(); } + RelocationSection* asRelocationSection() override { return this; } + + size_t relocationCount() const override { return relocations.size(); } + Relocation* relocation(size_t i) override { return relocations[i].get(); } + Relocation* addRelocation(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend) override; + Section* targetSection() override { return section; } + uint64_t memSize() const override { return GElfSection::memSize(); } + bool setMemSize(uint64_t s) override { return GElfSection::setMemSize(s); } + uint64_t memAlign() const override { return GElfSection::memAlign(); } + bool setAlign(uint64_t a) override { return GElfSection::setAlign(a); } + friend class GElfRelocation; + }; + + class GElfImage : public Image { + public: + GElfImage(int elfclass); + ~GElfImage(); + bool initNew(uint16_t machine, uint16_t type, uint8_t os_abi = 0, uint8_t abi_version = 0, uint32_t e_flags = 0) override; + bool loadFromFile(const std::string& filename) override; + bool saveToFile(const std::string& filename) override; + bool initFromBuffer(const void* buffer, size_t size); + bool initAsBuffer(const 
void* buffer, size_t size); + bool close(); + bool writeTo(const std::string& filename) override; + bool copyToBuffer(void** buf, size_t* size = 0) override; + bool copyToBuffer(void* buf, size_t size) override; + + const char* data() override { assert(buffer); return buffer; } + uint64_t size() override; + + bool push(); + + bool Freeze() override; + bool Validate() override; + + uint16_t Machine() override { return ehdr.e_machine; } + uint16_t Type() override { return ehdr.e_type; } + + GElfStringTable* shstrtab(); + GElfStringTable* strtab(); + GElfSymbolTable* getSymtab(uint16_t index) + { + return static_cast(section(index)); + } + + GElfStringTable* addStringTable(const std::string& name) override; + GElfStringTable* getStringTable(uint16_t index) override; + + GElfSymbolTable* addSymbolTable(const std::string& name, StringTable* stab = 0) override; + GElfSymbolTable* symtab(); + + GElfSegment* segment(size_t i) override { return segments[i].get(); } + Segment* segmentByVAddr(uint64_t vaddr) override; + size_t sectionCount() override { return sections.size(); } + GElfSection* section(size_t i) override { return sections[i].get(); } + Section* sectionByVAddr(uint64_t vaddr) override; + uint16_t machine() const; + uint16_t etype() const; + int eclass() const { return elfclass; } + bool elfError(const char* msg); + + GElfNoteSection* note() override; + GElfNoteSection* addNoteSection(const std::string& name) override; + + size_t segmentCount() override { return segments.size(); } + Segment* initSegment(uint32_t type, uint32_t flags, uint64_t paddr = 0) override; + bool addSegments() override; + + Section* addSection(const std::string &name, + uint32_t type, + uint64_t flags = 0, + uint64_t entsize = 0, + Segment* segment = 0) override; + + RelocationSection* addRelocationSection(Section* sec, SymbolTable* symtab); + RelocationSection* relocationSection(Section* sec, SymbolTable* symtab = 0) override; + + private: + bool frozen; + int elfclass; + FileImage img; + 
const char* buffer; + size_t bufferSize; + Elf* e; + GElf_Ehdr ehdr; + GElfStringTable* shstrtabSection; + GElfStringTable* strtabSection; + GElfSymbolTable* symtabSection; + GElfNoteSection* noteSection; + std::vector> segments; + std::vector> sections; + + bool imgError(); + const char *elfError(); + bool elfBegin(Elf_Cmd cmd); + bool elfEnd(); + bool push0(); + bool pullElf(); + + friend class GElfSection; + friend class GElfSymbolTable; + friend class GElfNoteSection; + friend class GElfRelocationSection; + friend class GElfSegment; + friend class GElfSymbol; + }; + + GElfSegment::GElfSegment(GElfImage* elf_, uint16_t index_) + : elf(elf_), + index(index_) + { + memset(&phdr, 0, sizeof(phdr)); + } + + GElfSegment::GElfSegment(GElfImage* elf_, uint16_t index_, + uint32_t type, uint32_t flags, uint64_t paddr) + : elf(elf_), + index(index_) + { + memset(&phdr, 0, sizeof(phdr)); + phdr.p_type = type; + phdr.p_flags = flags; + phdr.p_paddr = paddr; + } + + const char* GElfSegment::data() const + { + return (const char*) elf->data() + phdr.p_offset; + } + + bool GElfImage::Freeze() + { + assert(!frozen); + if (!push()) { return false; } + frozen = true; + return true; + } + + bool GElfImage::Validate() + { + if (ELFMAG0 != ehdr.e_ident[EI_MAG0] || + ELFMAG1 != ehdr.e_ident[EI_MAG1] || + ELFMAG2 != ehdr.e_ident[EI_MAG2] || + ELFMAG3 != ehdr.e_ident[EI_MAG3]) { + out << "Invalid ELF magic" << std::endl; + return false; + } + if (EV_CURRENT != ehdr.e_version) { + out << "Invalid ELF version" << std::endl; + return false; + } + return true; + } + + bool GElfSegment::push(uint64_t vaddr) + { + phdr.p_align = 0; + phdr.p_offset = 0; + if (!sections.empty()) { + phdr.p_offset = sections[0]->offset(); + } + for (Section* section : sections) { + phdr.p_align = (std::max)(phdr.p_align, section->memAlign()); + } + phdr.p_vaddr = alignUp(vaddr, (std::max)(phdr.p_align, (uint64_t) 1)); + phdr.p_filesz = 0; + phdr.p_memsz = 0; + for (Section* section : sections) { + phdr.p_memsz = 
alignUp(phdr.p_memsz, (std::max)(section->memAlign(), (uint64_t) 1)); + phdr.p_filesz = alignUp(phdr.p_filesz, (std::max)(section->memAlign(), (uint64_t) 1)); + if (!section->updateAddr(phdr.p_vaddr + phdr.p_memsz)) { return false; } + phdr.p_filesz += (section->type() == SHT_NOBITS) ? 0 : section->size(); + phdr.p_memsz += section->memSize(); + } + if (!gelf_update_phdr(elf->e, index, &phdr)) { return elf->elfError("gelf_update_phdr failed"); } + return true; + } + + bool GElfSegment::pull() + { + if (!gelf_getphdr(elf->e, index, &phdr)) { return elf->elfError("gelf_getphdr failed"); } + return true; + } + + uint16_t GElfSegment::getSegmentIndex() + { + return index; + } + + bool GElfSegment::updateAddSection(Section *section) + { + sections.push_back(section); + return true; + } + + GElfSection::GElfSection(GElfImage* elf_) + : elf(elf_), + memsize_(0), + align_(0), + reloc_sec(nullptr), + ndxscn(0) + { + } + + uint16_t GElfSection::getSectionIndex() const + { + return (uint16_t)ndxscn; + } + + std::string GElfSection::Name() const + { + return std::string(elf->shstrtab()->getString(hdr.sh_name)); + } + + bool GElfSection::updateAddr(uint64_t addr) + { + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + assert(scn); + if (!gelf_getshdr(scn, &hdr)) { return elf->elfError("gelf_get_shdr failed"); } + hdr.sh_addr = addr; + if (!gelf_update_shdr(scn, &hdr)) { return elf->elfError("gelf_update_shdr failed"); } + return true; + } + + bool GElfSection::push(const char* name, uint32_t shtype, uint64_t shflags, uint16_t shlink, uint32_t info, uint32_t align, uint64_t entsize) + { + Elf_Scn *scn = elf_newscn(elf->e); + if (!scn) { return false; } + ndxscn = elf_ndxscn(scn); + if (!gelf_getshdr(scn, &hdr)) { return elf->elfError("gelf_get_shdr failed"); } + align = (std::max)(align, (uint32_t) 8); + hdr.sh_name = elf->shstrtab()->addString1(name); + hdr.sh_type = shtype; + hdr.sh_flags = shflags; + hdr.sh_link = shlink; + hdr.sh_addr = 0; + hdr.sh_info = info; + hdr.sh_addralign 
= align; + hdr.sh_entsize = entsize; + if (!gelf_update_shdr(scn, &hdr)) { return elf->elfError("gelf_update_shdr failed"); } + return true; + } + + bool GElfSection::pull0() + { + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + if (!scn) { return false; } + if (!gelf_getshdr(scn, &hdr)) { return elf->elfError("gelf_get_shdr failed"); } + return true; + } + + bool GElfSection::pull(uint16_t ndx) + { + ndxscn = (size_t) ndx; + if (!pull0()) { return false; } + Elf_Scn *scn = elf_getscn(elf->e, ndx); + if (!scn) { return false; } + Elf_Data *edata0 = elf_getdata(scn, NULL); + if (edata0) { + data0 = Buffer((const Buffer::byte_type*)edata0->d_buf, edata0->d_size, edata0->d_align); + } + seg = elf->segmentByVAddr(hdr.sh_addr); + return true; + } + + bool GElfSection::push() + { + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + assert(scn); + Elf_Data *edata = nullptr; + edata = elf_newdata(scn); + if (!edata) { return elf->elfError("elf_newdata failed"); } + if (hdr.sh_type == SHT_NOBITS) { + edata->d_buf = 0; + edata->d_size = memsize_; + if (align_ != 0) { + edata->d_align = align_; + } + } else { + edata->d_buf = (void*)data.raw(); + edata->d_size = data.size(); + if (data.align() != 0) { + edata->d_align = data.align(); + } + } + edata->d_align = (std::max)(edata->d_align, (uint64_t) 8); + switch (hdr.sh_type) { + case SHT_RELA: + edata->d_type = ELF_T_RELA; + break; + case SHT_SYMTAB: + edata->d_type = ELF_T_SYM; + break; + default: + edata->d_type = ELF_T_BYTE; + break; + } + edata->d_version = EV_CURRENT; + if (!gelf_getshdr(scn, &hdr)) { return elf->elfError("gelf_get_shdr failed"); } + hdr.sh_size = edata->d_size; + hdr.sh_addralign = edata->d_align; + if (!gelf_update_shdr(scn, &hdr)) { return elf->elfError("gelf_update_shdr failed"); } + return true; + } + + uint64_t GElfSection::nextDataOffset(uint64_t align) const + { + return data.nextOffset(align); + } + + uint64_t GElfSection::addData(const void *src, uint64_t size, uint64_t align) + { + return data.add(src, 
size, align); + } + + bool GElfSection::getData(uint64_t offset, void* dest, uint64_t size) + { + Elf_Data* edata = 0; + uint64_t coffset = 0; + uint64_t csize = 0; + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + assert(scn); + if ((edata = elf_getdata(scn, edata)) != 0) { + if (coffset <= offset && offset <= coffset + edata->d_size) { + csize = (std::min)(size, edata->d_size - offset); + memcpy(dest, (const char*) edata->d_buf + offset - coffset, csize); + coffset += csize; + dest = (char*) dest + csize; + size -= csize; + if (!size) { return true; } + } + } + return false; + } + + RelocationSection* GElfSection::relocationSection(SymbolTable* symtab) + { + if (!reloc_sec) { + reloc_sec = elf->addRelocationSection(this, symtab); + } + return reloc_sec; + } + + GElfStringTable::GElfStringTable(GElfImage* elf) + : GElfSection(elf) + { + } + + bool GElfStringTable::push(const char* name, uint32_t shtype, uint64_t shflags) + { + if (!GElfSection::push(name, shtype, shflags, SHN_UNDEF, 0, 0)) { return false; } + return true; + } + + bool GElfStringTable::pullData() + { + return true; + } + + const char* GElfStringTable::addString(const std::string& s) + { + if (data0.size() == 0 && data.size() == 0) { + data.add('\0'); + } + return data.get(data.addString(s)); + } + + size_t GElfStringTable::addString1(const std::string& s) + { + if (data0.size() == 0 && data.size() == 0) { + data.add('\0'); + } + return data.addString(s); + } + + const char* GElfStringTable::getString(size_t ndx) + { + if (data0.has(ndx)) { return data0.get(ndx); } + else if (data.has(ndx)) { return data.get(ndx); } + return nullptr; + } + + size_t GElfStringTable::getStringIndex(const char* s) + { + if (data0.has(s)) { + return data0.getOffset(s); + } else if (data.has(s)) { + return data.getOffset(s); + } else { + assert(false); + return 0; + } + } + + GElfSymbol::GElfSymbol(GElfSymbolTable* symtab_, Buffer &data_, size_t index_) + : symtab(symtab_), + edata(data_), + eindex(index_) + { + } + + 
Section* GElfSymbol::section()
+    {
+      // Returns the section this symbol is defined in, or 0 when the symbol's
+      // section index is SHN_UNDEF.
+      if (Sym()->st_shndx != SHN_UNDEF) {
+        return symtab->elf->section(Sym()->st_shndx);
+      }
+      return 0;
+    }
+
+    // Fills in the underlying GElf_Sym record; the name is stored through the
+    // symbol table's string table and referenced by the index addString1 returns.
+    bool GElfSymbol::push(const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, uint16_t shndx, unsigned char other)
+    {
+      Sym()->st_name = symtab->strtab->addString1(name.c_str());
+      Sym()->st_value = value;
+      Sym()->st_size = size;
+      Sym()->st_info = GELF_ST_INFO(binding, type);
+      Sym()->st_shndx = shndx;
+      Sym()->st_other = other;
+      return true;
+    }
+
+    // Returns the symbol's name, or an empty string when the string table has
+    // no entry at st_name.
+    std::string GElfSymbol::name()
+    {
+      // getString() returns nullptr for an unknown index; building a
+      // std::string from a null pointer is undefined behaviour.
+      const char* s = symtab->strtab->getString(Sym()->st_name);
+      return s ? std::string(s) : std::string();
+    }
+
+    GElfSymbolTable::GElfSymbolTable(GElfImage* elf)
+      : GElfSection(elf),
+        strtab(0)
+    {
+    }
+
+    // Creates the SHT_SYMTAB section header. When no string table is given,
+    // the image's default one (elf->strtab()) is linked instead.
+    bool GElfSymbolTable::push(const char* name, GElfStringTable* strtab)
+    {
+      if (!strtab) { strtab = elf->strtab(); }
+      this->strtab = strtab;
+      if (!GElfSection::push(name, SHT_SYMTAB, 0, strtab->getSectionIndex(), 0, 0, sizeof(Elf64_Sym))) { return false; }
+      return true;
+    }
+
+    // Builds one GElfSymbol per GElf_Sym-sized record found in data0 and
+    // resolves the linked string table from sh_link.
+    bool GElfSymbolTable::pullData()
+    {
+      strtab = elf->getStringTable(hdr.sh_link);
+      for (size_t i = 0; i < data0.size() / sizeof(GElf_Sym); ++i) {
+        symbols.push_back(std::unique_ptr<GElfSymbol>(new GElfSymbol(this, data0, i * sizeof(GElf_Sym))));
+      }
+      return true;
+    }
+
+    // Creates a symbol backed by the offset returned from data.reserve(),
+    // fills it in and appends it to `symbols`.
+    // Returns nullptr when the allocation or GElfSymbol::push fails.
+    Symbol* GElfSymbolTable::addSymbolInternal(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other)
+    {
+      GElfSymbol *sym = new (std::nothrow) GElfSymbol(this, data, data.reserve());
+      // nothrow new signals failure with a null pointer, so check before use.
+      if (!sym) { return nullptr; }
+      uint16_t shndx = section ? section->getSectionIndex() : (uint16_t) SHN_UNDEF;
+      if (!sym->push(name, value, size, type, binding, shndx, other)) {
+        delete sym;
+        return nullptr;
+      }
+      symbols.push_back(std::unique_ptr<GElfSymbol>(sym));
+      return sym;
+    }
+
+    // Adds a symbol. The first call also inserts the all-zero symbol at
+    // index 0 ahead of the requested one.
+    Symbol* GElfSymbolTable::addSymbol(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other)
+    {
+      if (symbols.size() == 0) {
+        // If the placeholder cannot be created, do not let a real symbol
+        // take index 0 in its place.
+        if (!this->addSymbolInternal(nullptr, "", 0, 0, 0, 0, 0)) { return nullptr; }
+      }
+      return this->addSymbolInternal(section, name, value, size, type, binding, other);
+    }
+
+    size_t GElfSymbolTable::symbolCount()
+    {
+      return symbols.size();
+    }
+
+    // Returns the i-th symbol; i is not range-checked.
+    Symbol* GElfSymbolTable::symbol(size_t i)
+    {
+      return symbols[i].get();
+    }
+
+    GElfNoteSection::GElfNoteSection(GElfImage* elf)
+      : GElfSection(elf)
+    {
+    }
+
+    // Creates the SHT_NOTE section header with the given name.
+    bool GElfNoteSection::push(const std::string& name)
+    {
+      return GElfSection::push(name.c_str(), SHT_NOTE, 0, 0, 0, 8);
+    }
+
+    // Appends one note record to the section data: name length, descriptor
+    // size, type, then the name and (if any) the descriptor bytes, each
+    // written with NOTE_RECORD_ALIGNMENT.
+    bool GElfNoteSection::addNote(const std::string& name, uint32_t type, const void* desc, uint32_t desc_size)
+    {
+      data.addStringLength(name, NOTE_RECORD_ALIGNMENT);
+      data.add(desc_size, NOTE_RECORD_ALIGNMENT);
+      data.add(type, NOTE_RECORD_ALIGNMENT);
+      data.addString(name, NOTE_RECORD_ALIGNMENT);
+      data.align(NOTE_RECORD_ALIGNMENT);
+      if (desc_size > 0) {
+        assert(desc);
+        data.add(desc, desc_size, NOTE_RECORD_ALIGNMENT);
+        data.align(NOTE_RECORD_ALIGNMENT);
+      }
+      return true;
+    }
+
+    bool GElfNoteSection::getNote(const std::string& name, uint32_t type, void** desc, uint32_t* desc_size)
+    {
+      Elf_Data* data = 0;
+      Elf_Scn *scn = elf_getscn(elf->e, ndxscn);
+      assert(scn);
+      while ((data = elf_getdata(scn, data)) != 0) {
+        uint32_t note_offset = 0;
+        while (note_offset < data->d_size) {
+          char* notec = (char *) data->d_buf + note_offset;
+          Elf64_Nhdr* note = (Elf64_Nhdr*) notec;
+          if (type == note->n_type) {
+            std::string note_name = GetNoteString(note->n_namesz, notec + sizeof(Elf64_Nhdr));
+            if (name == note_name) {
+              *desc = notec +
sizeof(Elf64_Nhdr) + alignUp(note->n_namesz, 4); + *desc_size = note->n_descsz; + return true; + } + } + note_offset += sizeof(Elf64_Nhdr) + alignUp(note->n_namesz, 4) + alignUp(note->n_descsz, 4); + } + } + return false; + } + + bool GElfRelocation::push(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend) + { + Rela()->r_info = GELF_R_INFO((uint64_t) symbol->index(), type); + Rela()->r_offset = offset; + Rela()->r_addend = addend; + return true; + } + + RelocationSection* GElfRelocation::section() + { + return rsection; + } + + Symbol* GElfRelocation::symbol() + { + return rsection->symtab->symbol(symbolIndex()); + } + + GElfRelocationSection::GElfRelocationSection(GElfImage* elf, Section* section_, GElfSymbolTable* symtab_) + : GElfSection(elf), + section(section_), + symtab(symtab_) + { + } + + bool GElfRelocationSection::push(const std::string& name) + { + return GElfSection::push(name.c_str(), SHT_RELA, 0, symtab->getSectionIndex(), section->getSectionIndex(), 0, sizeof(Elf64_Rela)); + } + + Relocation* GElfRelocationSection::addRelocation(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend) + { + GElfRelocation *rela = new (std::nothrow) GElfRelocation(this, data, data.reserve()); + if (!rela || !rela->push(type, symbol, offset, addend)) { + delete rela; + return nullptr; + } + relocations.push_back(std::unique_ptr(rela)); + return rela; + } + + bool GElfRelocationSection::pullData() + { + section = elf->section(hdr.sh_info); + symtab = elf->getSymtab(hdr.sh_link); + Elf_Scn *lScn = elf_getscn(elf->e, ndxscn); + assert(lScn); + Elf_Data *lData = elf_getdata(lScn, nullptr); + assert(lData); + data0 = Buffer((const Buffer::byte_type*)lData->d_buf, lData->d_size, lData->d_align); + for (size_t i = 0; i < data0.size() / sizeof(GElf_Rela); ++i) { + relocations.push_back(std::unique_ptr(new GElfRelocation(this, data0, i * sizeof(GElf_Rela)))); + } + return true; + } + + GElfImage::GElfImage(int elfclass_) + : frozen(true), + 
elfclass(elfclass_), + buffer(0), bufferSize(0), + e(0), + shstrtabSection(0), strtabSection(0), + symtabSection(0), + noteSection(0) + { + if (EV_NONE == elf_version(EV_CURRENT)) { + assert(false); + } + } + + GElfImage::~GElfImage() + { + elf_end(e); + } + + bool GElfImage::imgError() + { + out << img.output(); + return false; + } + + const char *GElfImage::elfError() + { + return elf_errmsg(-1); + } + + bool GElfImage::elfBegin(Elf_Cmd cmd) + { + if ((e = elf_begin(img.fd(), cmd, NULL +#ifdef AMD_LIBELF + , NULL +#endif + )) == NULL) { + out << "elf_begin failed: " << elfError() << std::endl; + return false; + } + return true; + } + + bool GElfImage::initNew(uint16_t machine, uint16_t type, uint8_t os_abi, uint8_t abi_version, uint32_t e_flags) + { + if (!img.create()) { return imgError(); } + if (!elfBegin(ELF_C_WRITE)) { return false; } + if (!gelf_newehdr(e, elfclass)) { return elfError("gelf_newehdr failed"); } + if (!gelf_getehdr(e, &ehdr)) { return elfError("gelf_getehdr failed"); } + ehdr.e_ident[EI_DATA] = ELFDATA2LSB; + ehdr.e_ident[EI_VERSION] = EV_CURRENT; + ehdr.e_ident[EI_OSABI] = os_abi; + ehdr.e_ident[EI_ABIVERSION] = abi_version; + ehdr.e_machine = machine; + ehdr.e_type = type; + ehdr.e_version = EV_CURRENT; + ehdr.e_flags = e_flags; + if (!gelf_update_ehdr(e, &ehdr)) { return elfError("gelf_updateehdr failed"); } + sections.push_back(std::unique_ptr()); + if (!shstrtab()->push(".shstrtab", SHT_STRTAB, SHF_STRINGS)) { return elfError("Failed to create shstrtab"); } + ehdr.e_shstrndx = shstrtab()->getSectionIndex(); + if (!gelf_update_ehdr(e, &ehdr)) { return elfError("gelf_updateehdr failed"); } + if (!strtab()->push(".strtab", SHT_STRTAB, SHF_STRINGS)) { return elfError("Failed to create strtab"); } + frozen = false; + return true; + } + + bool GElfImage::loadFromFile(const std::string& filename) + { + if (!img.create()) { return imgError(); } + if (!img.readFrom(filename)) { return imgError(); } + if (!elfBegin(ELF_C_RDWR)) { return false; } + 
return pullElf(); + } + + bool GElfImage::saveToFile(const std::string& filename) + { + if (buffer) { + std::ofstream out(filename.c_str(), std::ios::binary); + if (out.fail()) { return false; } + out.write(buffer, bufferSize); + return !out.fail(); + } else { + if (!push()) { return false; } + return img.writeTo(filename); + } + } + + bool GElfImage::initFromBuffer(const void* buffer, size_t size) + { + if (size == 0) { size = ElfSize(buffer); } + if (!img.create()) { return imgError(); } + if (!img.copyFrom(buffer, size)) { return imgError(); } + if (!elfBegin(ELF_C_RDWR)) { return false; } + return pullElf(); + } + + bool GElfImage::initAsBuffer(const void* buffer, size_t size) + { + if (size == 0) { size = ElfSize(buffer); } + if ((e = elf_memory(reinterpret_cast(const_cast(buffer)), size +#ifdef AMD_LIBELF + , NULL +#endif + )) == NULL) { + out << "elf_begin(buffer) failed: " << elfError() << std::endl; + return false; + } + this->buffer = reinterpret_cast(buffer); + this->bufferSize = size; + return pullElf(); + } + + bool GElfImage::pullElf() + { + if (!gelf_getehdr(e, &ehdr)) { return elfError("gelf_getehdr failed"); } + segments.reserve(ehdr.e_phnum); + for (size_t i = 0; i < ehdr.e_phnum; ++i) { + GElfSegment* segment = new GElfSegment(this, i); + segment->pull(); + segments.push_back(std::unique_ptr(segment)); + } + + shstrtabSection = new GElfStringTable(this); + if (!shstrtabSection->pull(ehdr.e_shstrndx)) { return false; } + Elf_Scn* scn = 0; + for (unsigned n = 0; n < ehdr.e_shnum; ++n) { + scn = elf_getscn(e, n); + if (n == ehdr.e_shstrndx) { + sections.push_back(std::unique_ptr(shstrtabSection)); + continue; + } + GElf_Shdr shdr; + if (!gelf_getshdr(scn, &shdr)) { return elfError("Failed to get shdr"); } + GElfSection* section = 0; + if (shdr.sh_type == SHT_NOTE) { + section = new GElfNoteSection(this); + } else if (shdr.sh_type == SHT_RELA) { + section = new GElfRelocationSection(this); + } else if (shdr.sh_type == SHT_STRTAB) { + section = new 
GElfStringTable(this); + } else if (shdr.sh_type == SHT_SYMTAB) { + section = new GElfSymbolTable(this); + } else if (shdr.sh_type == SHT_NULL) { + section = 0; + sections.push_back(std::unique_ptr()); + } else { + section = new GElfSection(this); + } + if (section) { + sections.push_back(std::unique_ptr(section)); + if (!section->pull(n)) { return false; } + } + } + + for (size_t n = 1; n < sections.size(); ++n) { + GElfSection* section = sections[n].get(); + if (section->type() == SHT_STRTAB) { + if (!section->pullData()) { return false; } + } + } + + for (size_t n = 1; n < sections.size(); ++n) { + GElfSection* section = sections[n].get(); + if (section->type() == SHT_SYMTAB) { + if (!section->pullData()) { return false; } + } + } + + for (size_t n = 1; n < sections.size(); ++n) { + GElfSection* section = sections[n].get(); + if (section->type() != SHT_STRTAB && section->type() != SHT_SYMTAB) { + if (!section->pullData()) { return false; } + } + } + + for (size_t i = 1; i < sections.size(); ++i) { + if (i == ehdr.e_shstrndx || i == ehdr.e_shstrndx) { continue; } + std::unique_ptr& section = sections[i]; + if (section->Name() == ".strtab") { strtabSection = static_cast(section.get()); } + if (section->Name() == ".symtab") { symtabSection = static_cast(section.get()); } + if (section->Name() == ".note") { noteSection = static_cast(section.get()); } + } + + size_t phnum; + if (elf_getphdrnum(e, &phnum) < 0) { return elfError("elf_getphdrnum failed"); } + for (size_t i = 0; i < phnum; ++i) { + segments.push_back(std::unique_ptr(new GElfSegment(this, i))); + if (!segments[i]->pull()) { return false; } + } + + return true; + } + + bool GElfImage::elfError(const char* msg) + { + out << "Error: " << msg << ": " << elfError() << std::endl; + return false; + } + + uint64_t GElfImage::size() + { + if (buffer) { + return ElfSize(buffer); + } else { + return img.getSize(); + } + } + + bool GElfImage::push0() + { + assert(e); + for (std::unique_ptr& section : sections) { + if 
(section && !section->push()) { return false; }
+      }
+
+      for (std::unique_ptr<GElfSection>& section : sections) {
+        if (section && !section->pull0()) { return false; }
+      }
+
+      if (!segments.empty()) {
+        if (!gelf_newphdr(e, segments.size())) { return elfError("gelf_newphdr failed"); }
+      }
+      if (elf_update(e, ELF_C_NULL) < 0) { return elfError("elf_update (1.1) failed"); }
+      if (!segments.empty()) {
+        for (std::unique_ptr<GElfSection>& section : sections) {
+          // Update section offsets.
+          if (section && !section->pull0()) { return false; }
+        }
+        // Each segment is placed after the end of the previous one.
+        uint64_t vaddr = 0;
+        for (std::unique_ptr<GElfSegment>& segment : segments) {
+          if (!segment->push(vaddr)) { return false; }
+          vaddr = segment->vaddr() + segment->memSize();
+        }
+      }
+      return true;
+    }
+
+    // Pushes all pending section/segment state (push0) and has libelf write
+    // the result out.
+    bool GElfImage::push()
+    {
+      if (!push0()) { return false; }
+      if (elf_update(e, ELF_C_WRITE) < 0) { return elfError("elf_update (2) failed"); }
+      return true;
+    }
+
+    // Returns the segment whose [vaddr, vaddr + memSize) range contains
+    // `vaddr`, or 0 if there is none.
+    Segment* GElfImage::segmentByVAddr(uint64_t vaddr)
+    {
+      for (std::unique_ptr<GElfSegment>& seg : segments) {
+        if (seg->vaddr() <= vaddr && vaddr < seg->vaddr() + seg->memSize()) {
+          return seg.get();
+        }
+      }
+      return 0;
+    }
+
+    // Returns the section whose [addr, addr + size) range contains `vaddr`,
+    // or nullptr if there is none.
+    Section* GElfImage::sectionByVAddr(uint64_t vaddr)
+    {
+      for (size_t n = 1; n < sections.size(); ++n) {
+        // `sections` may contain empty slots (push0 guards against them too).
+        GElfSection* candidate = sections[n].get();
+        if (!candidate) { continue; }
+        if (candidate->addr() <= vaddr && vaddr < candidate->addr() + candidate->size()) {
+          return candidate;
+        }
+      }
+      return nullptr;
+    }
+
+    bool GElfImage::elfEnd()
+    {
+      return false;
+    }
+
+    // Writes the file image to `filename` without pushing pending changes.
+    bool GElfImage::writeTo(const std::string& filename)
+    {
+      if (!img.writeTo(filename)) { return imgError(); }
+      return true;
+    }
+
+    // Returns a copy of the image in *buf and, if `size` is given, its length.
+    // For a buffer-backed image the copy is allocated with malloc(), so the
+    // caller releases it with free(). Returns false if the allocation fails;
+    // *buf is left untouched in that case.
+    bool GElfImage::copyToBuffer(void** buf, size_t* size)
+    {
+      if (buffer) {
+        void* copy = malloc(bufferSize);
+        if (!copy) { return false; }
+        memcpy(copy, buffer, bufferSize);
+        *buf = copy;
+        if (size) { *size = bufferSize; }
+        return true;
+      } else {
+        return img.copyTo(buf, size);
+      }
+    }
+
+    bool GElfImage::copyToBuffer(void* buf, size_t size)
+    {
+      if (buffer) {
+        if (size < bufferSize) { return false; }
+        memcpy(buf, buffer, bufferSize);
+        return true;
+      } else {
+        return img.copyTo(buf, size);
+      }
+    }
+
+    // Creates an empty string table object and appends it to `sections`.
+    // `name` is not used by this function; callers name the section when
+    // they push it (as initNew does).
+    GElfStringTable* GElfImage::addStringTable(const std::string& name)
+    {
+      GElfStringTable* stab = new GElfStringTable(this);
+      sections.push_back(std::unique_ptr<GElfSection>(stab));
+      return stab;
+    }
+
+    // Returns section `index` viewed as a string table; neither the index nor
+    // the section's type is checked.
+    GElfStringTable* GElfImage::getStringTable(uint16_t index)
+    {
+      return static_cast<GElfStringTable*>(sections[index].get());
+    }
+
+    // Creates a symbol table section named `name`, linked to `stab` or, when
+    // `stab` is null, to the image's default string table.
+    GElfSymbolTable* GElfImage::addSymbolTable(const std::string& name, StringTable* stab)
+    {
+      if (!stab) { stab = strtab(); }
+      const char* name0 = shstrtab()->addString(name);
+      GElfSymbolTable* symtab = new GElfSymbolTable(this);
+      symtab->push(name0, static_cast<GElfStringTable*>(stab));
+      sections.push_back(std::unique_ptr<GElfSection>(symtab));
+      return symtab;
+    }
+
+    // Returns the section-name string table, creating it on first use.
+    GElfStringTable* GElfImage::shstrtab() {
+      if (!shstrtabSection) {
+        shstrtabSection = addStringTable(".shstrtab");
+      }
+      return shstrtabSection;
+    }
+
+    // Returns the default string table, creating it on first use.
+    GElfStringTable* GElfImage::strtab() {
+      if (!strtabSection) {
+        strtabSection = addStringTable(".strtab");
+      }
+      return strtabSection;
+    }
+
+    // Returns the default symbol table, creating it on first use.
+    GElfSymbolTable* GElfImage::symtab()
+    {
+      if (!symtabSection) {
+        symtabSection = addSymbolTable(".symtab", strtab());
+      }
+      return symtabSection;
+    }
+
+
+    // Returns the default note section, creating it on first use.
+    GElfNoteSection* GElfImage::note()
+    {
+      if (!noteSection) { noteSection = addNoteSection(".note"); }
+      return noteSection;
+    }
+
+    // Creates a note section named `name` and appends it to `sections`.
+    GElfNoteSection* GElfImage::addNoteSection(const std::string& name)
+    {
+      GElfNoteSection* note = new GElfNoteSection(this);
+      note->push(name);
+      sections.push_back(std::unique_ptr<GElfSection>(note));
+      return note;
+    }
+
+    // Creates a segment whose index is the current segment count and appends
+    // it to `segments`. Returns nullptr if the allocation fails.
+    Segment* GElfImage::initSegment(uint32_t type, uint32_t flags, uint64_t paddr)
+    {
+      GElfSegment *seg = new (std::nothrow) GElfSegment(this, (uint16_t) segments.size(), type, flags, paddr);
+      // Never store a null entry: the loops over `segments` dereference
+      // every element.
+      if (!seg) { return nullptr; }
+      segments.push_back(std::unique_ptr<GElfSegment>(seg));
+      return seg;
+    }
+
+    bool GElfImage::addSegments()
+    {
+      return true;
+    }
+
+    // Creates a plain section and, when `segment` is given, registers the
+    // section with that segment. Returns nullptr on any failure.
+    Section* GElfImage::addSection(const std::string &name,
+                                   uint32_t type,
+                                   uint64_t flags,
+                                   uint64_t entsize, Segment* segment)
+    {
+      GElfSection *section = new (std::nothrow) GElfSection(this);
+      if (!section || !section->push(name.c_str(), type, flags, 0, 0, 0, entsize)) {
+        delete section;
+        return nullptr;
+      }
+      if (segment) {
+        if (!segment->updateAddSection(section)) {
+          delete section;
+          return nullptr;
+        }
+      }
+      sections.push_back(std::unique_ptr<GElfSection>(section));
+      return section;
+    }
+
+    // Creates the ".rela<section name>" relocation section for `sec`, using
+    // `symtab` or, when it is null, the image's default symbol table.
+    // Returns nullptr on any failure.
+    RelocationSection* GElfImage::addRelocationSection(Section* sec, SymbolTable* symtab)
+    {
+      std::string section_name = ".rela" + sec->Name();
+      if (!symtab) { symtab = this->symtab(); }
+      // nothrow, so that the null check below can actually trigger.
+      GElfRelocationSection *rsec = new (std::nothrow) GElfRelocationSection(this, sec, (GElfSymbolTable*) symtab);
+      if (!rsec || !rsec->push(section_name)) {
+        delete rsec;
+        return nullptr;
+      }
+      sections.push_back(std::unique_ptr<GElfSection>(rsec));
+      return rsec;
+    }
+
+    RelocationSection* GElfImage::relocationSection(Section* sec, SymbolTable* symtab)
+    {
+      return sec->relocationSection(symtab);
+    }
+
+    uint16_t GElfImage::machine() const
+    {
+      return ehdr.e_machine;
+    }
+
+    uint16_t GElfImage::etype() const
+    {
+      return ehdr.e_type;
+    }
+
+    Image* NewElf32Image() { return new GElfImage(ELFCLASS32); }
+    Image* NewElf64Image() { return new GElfImage(ELFCLASS64); }
+
+    // Computes the size in bytes of the 64-bit ELF image at `emi` from its
+    // headers: the end of the section header table, or the end of the
+    // section with the largest file offset, if that offset lies beyond the
+    // table (SHT_NOBITS sections contribute no size).
+    // Returns 0 when `emi` is null or the ELF version is not EV_CURRENT.
+    uint64_t ElfSize(const void* emi)
+    {
+      const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*) emi;
+      if (NULL == ehdr || EV_CURRENT != ehdr->e_version) {
+        return 0;
+      }
+
+      const Elf64_Shdr *shdr = (const Elf64_Shdr*)((const char*)emi + ehdr->e_shoff);
+      if (NULL == shdr) {
+        return 0;
+      }
+
+      uint64_t max_offset = ehdr->e_shoff;
+      // Widen before multiplying: both fields are 16-bit and would otherwise
+      // be multiplied as int, which can overflow.
+      uint64_t total_size = max_offset + static_cast<uint64_t>(ehdr->e_shentsize) * ehdr->e_shnum;
+
+      for (uint16_t i = 0; i < ehdr->e_shnum; ++i) {
+        uint64_t cur_offset = static_cast<uint64_t>(shdr[i].sh_offset);
+        if (max_offset < cur_offset) {
+          max_offset = cur_offset;
+          total_size = max_offset;
+          if (SHT_NOBITS != shdr[i].sh_type) {
+            total_size += static_cast<uint64_t>(shdr[i].sh_size);
+          }
+        }
+      }
+
+      return total_size;
+    }
+
+    std::string GetNoteString(uint32_t s_size, const char* s)
+    {
+      if (!s_size) { return ""; }
+      if (s[s_size-1] == '\0') {
+        return std::string(s, s_size-1);
+      }
else { + return std::string(s, s_size); + } + } + + } +} diff --git a/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp b/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp new file mode 100644 index 0000000000..8e4b3051d9 --- /dev/null +++ b/runtime/hsa-runtime/libamdhsacode/amd_hsa_code.cpp @@ -0,0 +1,1340 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include "amd_hsa_code.hpp" +#include "amd_hsa_code_util.hpp" +#include +#include "amd_hsa_elf.h" +#include +#include +#include +#include + + +#ifndef _WIN32 +#define _alloca alloca +#endif + +namespace amd { +namespace hsa { +namespace code { + + using amd::elf::GetNoteString; + + bool Symbol::IsDeclaration() const + { + return elfsym->type() == STT_COMMON; + } + + bool Symbol::IsDefinition() const + { + return !IsDeclaration(); + } + + bool Symbol::IsAgent() const + { + return elfsym->section()->flags() & SHF_AMDGPU_HSA_AGENT ? true : false; + } + + hsa_symbol_linkage_t Symbol::Linkage() const + { + return elfsym->binding() == STB_GLOBAL ? HSA_SYMBOL_LINKAGE_PROGRAM : HSA_SYMBOL_LINKAGE_MODULE; + } + + hsa_variable_allocation_t Symbol::Allocation() const + { + return IsAgent() ? HSA_VARIABLE_ALLOCATION_AGENT : HSA_VARIABLE_ALLOCATION_PROGRAM; + } + + hsa_variable_segment_t Symbol::Segment() const + { + return elfsym->section()->flags() & SHF_AMDGPU_HSA_READONLY ? HSA_VARIABLE_SEGMENT_READONLY : HSA_VARIABLE_SEGMENT_GLOBAL; + } + + uint64_t Symbol::Size() const + { + return elfsym->size(); + } + + uint32_t Symbol::Size32() const + { + assert(elfsym->size() < UINT32_MAX); + return (uint32_t) Size(); + } + + uint32_t Symbol::Alignment() const + { + assert(elfsym->section()->addralign() < UINT32_MAX); + return uint32_t(elfsym->section()->addralign()); + } + + bool Symbol::IsConst() const + { + return elfsym->section()->flags() & SHF_WRITE ? 
true : false; + } + + hsa_status_t Symbol::GetInfo(hsa_code_symbol_info_t attribute, void *value) + { + assert(value); + std::string name = Name(); + switch (attribute) { + case HSA_CODE_SYMBOL_INFO_TYPE: { + *((hsa_symbol_kind_t*)value) = Kind(); + break; + } + case HSA_CODE_SYMBOL_INFO_NAME_LENGTH: + case HSA_CODE_SYMBOL_INFO_NAME: { + std::string matter = ""; + switch (Linkage()) { + case HSA_SYMBOL_LINKAGE_PROGRAM: + assert(name.rfind(":") == std::string::npos); + matter = name; + break; + case HSA_SYMBOL_LINKAGE_MODULE: + assert(name.rfind(":") != std::string::npos); + matter = name.substr(name.rfind(":") + 1); + break; + default: + assert(!"Unsupported linkage in Symbol::GetInfo"); + return HSA_STATUS_ERROR; + } + if (attribute == HSA_CODE_SYMBOL_INFO_NAME_LENGTH) { + *((uint32_t*) value) = matter.size() + 1; + } else { + memset(value, 0x0, matter.size() + 1); + memcpy(value, matter.c_str(), matter.size()); + } + break; + } + case HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH: + case HSA_CODE_SYMBOL_INFO_MODULE_NAME: { + switch (Linkage()) { + case HSA_SYMBOL_LINKAGE_PROGRAM: + if (attribute == HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH) { + *((uint32_t*) value) = 0; + } + break; + case HSA_SYMBOL_LINKAGE_MODULE: { + assert(name.find(":") != std::string::npos); + std::string matter = name.substr(0, name.find(":")); + if (attribute == HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH) { + *((uint32_t*) value) = matter.length() + 1; + } else { + memset(value, 0x0, matter.size() + 1); + memcpy(value, matter.c_str(), matter.length()); + ((char*)value)[matter.size() + 1] = '\0'; + } + break; + } + default: + assert(!"Unsupported linkage in Symbol::GetInfo"); + return HSA_STATUS_ERROR; + } + break; + } + case HSA_CODE_SYMBOL_INFO_LINKAGE: { + *((hsa_symbol_linkage_t*)value) = Linkage(); + break; + } + case HSA_CODE_SYMBOL_INFO_IS_DEFINITION: { + *((bool*)value) = IsDefinition(); + break; + } + default: { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + } + return HSA_STATUS_SUCCESS; 
+ } + + hsa_code_symbol_t Symbol::ToHandle(Symbol* sym) + { + hsa_code_symbol_t s; + s.handle = reinterpret_cast(sym); + return s; + } + + Symbol* Symbol::FromHandle(hsa_code_symbol_t s) + { + return reinterpret_cast(s.handle); + } + + KernelSymbol::KernelSymbol(amd::elf::Symbol* elfsym_, const amd_kernel_code_t* akc) + : Symbol(elfsym_) + , kernarg_segment_size(0) + , kernarg_segment_alignment(0) + , group_segment_size(0) + , private_segment_size(0) + , is_dynamic_callstack(0) + { + if (akc) { + kernarg_segment_size = (uint32_t) akc->kernarg_segment_byte_size; + kernarg_segment_alignment = (uint32_t) (1 << akc->kernarg_segment_alignment); + group_segment_size = uint32_t(akc->workgroup_group_segment_byte_size); + private_segment_size = uint32_t(akc->workitem_private_segment_byte_size); + is_dynamic_callstack = + AMD_HSA_BITS_GET(akc->kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK) ? true : false; + } + } + + hsa_status_t KernelSymbol::GetInfo(hsa_code_symbol_info_t attribute, void *value) + { + assert(value); + switch (attribute) { + case HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE: { + *((uint32_t*)value) = kernarg_segment_size; + break; + } + case HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT: { + *((uint32_t*)value) = kernarg_segment_alignment; + break; + } + case HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE: { + *((uint32_t*)value) = group_segment_size; + break; + } + case HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE: { + *((uint32_t*)value) = private_segment_size; + break; + } + case HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK: { + *((bool*)value) = is_dynamic_callstack; + break; + } + default: { + return Symbol::GetInfo(attribute, value); + } + } + return HSA_STATUS_SUCCESS; + } + + hsa_status_t VariableSymbol::GetInfo(hsa_code_symbol_info_t attribute, void *value) + { + assert(value); + switch (attribute) { + case HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION: { + *((hsa_variable_allocation_t*)value) = Allocation(); + 
break; + } + case HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT: { + *((hsa_variable_segment_t*)value) = Segment(); + break; + } + case HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT: { + *((uint32_t*)value) = Alignment(); + break; + } + case HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE: { + *((uint32_t*)value) = Size(); + break; + } + case HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST: { + *((bool*)value) = IsConst(); + break; + } + default: { + return Symbol::GetInfo(attribute, value); + } + } + return HSA_STATUS_SUCCESS; + } + + AmdHsaCode::AmdHsaCode(bool combineDataSegments_) + : img(nullptr), + combineDataSegments(combineDataSegments_), + hsatext(0), imageInit(0), samplerInit(0), + debugInfo(0), debugLine(0), debugAbbrev(0) + { + for (unsigned i = 0; i < AMDGPU_HSA_SEGMENT_LAST; ++i) { + for (unsigned j = 0; j < 2; ++j) { + hsaSegments[i][j] = 0; + } + } + for (unsigned i = 0; i < AMDGPU_HSA_SECTION_LAST; ++i) { + hsaSections[i] = 0; + } + } + + AmdHsaCode::~AmdHsaCode() + { + for (Symbol* sym : symbols) { delete sym; } + } + + bool AmdHsaCode::PullElf() + { + uint32_t majorVersion, minorVersion; + if (!GetNoteCodeObjectVersion(&majorVersion, &minorVersion)) { + return false; + } + if (majorVersion >= 2) { + return PullElfV2(); + } else { + return PullElfV1(); + } + } + + bool AmdHsaCode::PullElfV1() + { + for (size_t i = 0; i < img->segmentCount(); ++i) { + Segment* s = img->segment(i); + if (s->type() == PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM || + s->type() == PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT || + s->type() == PT_AMDGPU_HSA_LOAD_READONLY_AGENT || + s->type() == PT_AMDGPU_HSA_LOAD_CODE_AGENT) { + dataSegments.push_back(s); + } + } + for (size_t i = 0; i < img->sectionCount(); ++i) { + Section* sec = img->section(i); + if (!sec) { continue; } + if ((sec->type() == SHT_PROGBITS || sec->type() == SHT_NOBITS) && + (sec->flags() & (SHF_AMDGPU_HSA_AGENT | SHF_AMDGPU_HSA_GLOBAL | SHF_AMDGPU_HSA_READONLY | SHF_AMDGPU_HSA_CODE))) { + dataSections.push_back(sec); + } else if (sec->type() == SHT_RELA) { + 
relocationSections.push_back(sec->asRelocationSection()); + } + if (sec->Name() == ".hsatext") { + hsatext = sec; + } + } + for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) { + amd::elf::Symbol* elfsym = img->symtab()->symbol(i); + Symbol* sym = 0; + switch (elfsym->type()) { + case STT_AMDGPU_HSA_KERNEL: { + amd::elf::Section* sec = elfsym->section(); + amd_kernel_code_t akc; + if (!sec) { + out << "Failed to find section for symbol " << elfsym->name() << std::endl; + return false; + } + if (!(sec->flags() & (SHF_AMDGPU_HSA_AGENT | SHF_AMDGPU_HSA_CODE | SHF_EXECINSTR))) { + out << "Invalid code section for symbol " << elfsym->name() << std::endl; + return false; + } + if (!sec->getData(elfsym->value(), &akc, sizeof(amd_kernel_code_t))) { + out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl; + return false; + } + sym = new KernelSymbol(elfsym, &akc); + break; + } + case STT_OBJECT: + case STT_COMMON: + sym = new VariableSymbol(elfsym); + break; + default: + break; // Skip unknown symbols. 
+ } + if (sym) { symbols.push_back(sym); } + } + + return true; + } + + bool AmdHsaCode::LoadFromFile(const std::string& filename) + { + if (!img) { img.reset(amd::elf::NewElf64Image()); } + if (!img->loadFromFile(filename)) { return ElfImageError(); } + if (!PullElf()) { return ElfImageError(); } + return true; + } + + bool AmdHsaCode::SaveToFile(const std::string& filename) + { + return img->saveToFile(filename) || ElfImageError(); + } + + bool AmdHsaCode::WriteToBuffer(void* buffer) + { + return img->copyToBuffer(buffer, ElfSize()) || ElfImageError(); + } + + + bool AmdHsaCode::InitFromBuffer(const void* buffer, size_t size) + { + if (!img) { img.reset(amd::elf::NewElf64Image()); } + if (!img->initFromBuffer(buffer, size)) { return ElfImageError(); } + if (!PullElf()) { return ElfImageError(); } + return true; + } + + bool AmdHsaCode::InitAsBuffer(const void* buffer, size_t size) + { + if (!img) { img.reset(amd::elf::NewElf64Image()); } + if (!img->initAsBuffer(buffer, size)) { return ElfImageError(); } + if (!PullElf()) { return ElfImageError(); } + return true; + } + + bool AmdHsaCode::InitAsHandle(hsa_code_object_t code_object) + { + void *elfmemrd = reinterpret_cast(code_object.handle); + if (!elfmemrd) { return false; } + return InitAsBuffer(elfmemrd, 0); + } + + bool AmdHsaCode::InitNew(bool xnack) + { + if (!img) { + img.reset(amd::elf::NewElf64Image()); + uint32_t flags = 0; + if (xnack) { flags |= EF_AMDGPU_XNACK; } + return img->initNew(EM_AMDGPU, ET_EXEC, ELFOSABI_AMDGPU_HSA, ELFABIVERSION_AMDGPU_HSA, flags) || + ElfImageError(); // FIXME: elfutils libelf does not allow program headers in ET_REL file type, so change it later in finalizer. 
+ } + return false; + } + + bool AmdHsaCode::Freeze() + { + return img->Freeze() || ElfImageError(); + } + + hsa_code_object_t AmdHsaCode::GetHandle() + { + hsa_code_object_t code_object; + code_object.handle = reinterpret_cast(img->data()); + return code_object; + } + + const char* AmdHsaCode::ElfData() + { + return img->data(); + } + + uint64_t AmdHsaCode::ElfSize() + { + return img->size(); + } + + bool AmdHsaCode::Validate() + { + if (!img->Validate()) { return ElfImageError(); } + if (img->Machine() != EM_AMDGPU) { + out << "ELF error: Invalid machine" << std::endl; + return false; + } + return true; + } + + void AmdHsaCode::AddAmdNote(uint32_t type, const void* desc, uint32_t desc_size) + { + img->note()->addNote("AMD", type, desc, desc_size); + } + + void AmdHsaCode::AddNoteCodeObjectVersion(uint32_t major, uint32_t minor) + { + amdgpu_hsa_note_code_object_version_t desc; + desc.major_version = major; + desc.minor_version = minor; + AddAmdNote(NT_AMDGPU_HSA_CODE_OBJECT_VERSION, &desc, sizeof(desc)); + } + + bool AmdHsaCode::GetNoteCodeObjectVersion(uint32_t* major, uint32_t* minor) + { + amdgpu_hsa_note_code_object_version_t* desc; + if (!GetAmdNote(NT_AMDGPU_HSA_CODE_OBJECT_VERSION, &desc)) { return false; } + *major = desc->major_version; + *minor = desc->minor_version; + return true; + } + + bool AmdHsaCode::GetNoteCodeObjectVersion(std::string& version) + { + amdgpu_hsa_note_code_object_version_t* desc; + if (!GetAmdNote(NT_AMDGPU_HSA_CODE_OBJECT_VERSION, &desc)) { return false; } + version.clear(); + version += std::to_string(desc->major_version); + version += "."; + version += std::to_string(desc->minor_version); + return true; + } + + void AmdHsaCode::AddNoteHsail(uint32_t hsail_major, uint32_t hsail_minor, hsa_profile_t profile, hsa_machine_model_t machine_model, hsa_default_float_rounding_mode_t rounding_mode) + { + amdgpu_hsa_note_hsail_t desc; + memset(&desc, 0, sizeof(desc)); + desc.hsail_major_version = hsail_major; + desc.hsail_minor_version = 
hsail_minor; + desc.profile = uint8_t(profile); + desc.machine_model = uint8_t(machine_model); + desc.default_float_round = uint8_t(rounding_mode); + AddAmdNote(NT_AMDGPU_HSA_HSAIL, &desc, sizeof(desc)); + } + + bool AmdHsaCode::GetNoteHsail(uint32_t* hsail_major, uint32_t* hsail_minor, hsa_profile_t* profile, hsa_machine_model_t* machine_model, hsa_default_float_rounding_mode_t* default_float_round) + { + amdgpu_hsa_note_hsail_t *desc; + if (!GetAmdNote(NT_AMDGPU_HSA_HSAIL, &desc)) { return false; } + *hsail_major = desc->hsail_major_version; + *hsail_minor = desc->hsail_minor_version; + *profile = (hsa_profile_t) desc->profile; + *machine_model = (hsa_machine_model_t) desc->machine_model; + *default_float_round = (hsa_default_float_rounding_mode_t) desc->default_float_round; + return true; + } + + void AmdHsaCode::AddNoteIsa(const std::string& vendor_name, const std::string& architecture_name, uint32_t major, uint32_t minor, uint32_t stepping) + { + size_t size = sizeof(amdgpu_hsa_note_producer_t) + vendor_name.length() + architecture_name.length() + 1; + amdgpu_hsa_note_isa_t* desc = (amdgpu_hsa_note_isa_t*) _alloca(size); + memset(desc, 0, size); + desc->vendor_name_size = vendor_name.length()+1; + desc->architecture_name_size = architecture_name.length()+1; + desc->major = major; + desc->minor = minor; + desc->stepping = stepping; + memcpy(desc->vendor_and_architecture_name, vendor_name.c_str(), vendor_name.length() + 1); + memcpy(desc->vendor_and_architecture_name + desc->vendor_name_size, architecture_name.c_str(), architecture_name.length() + 1); + AddAmdNote(NT_AMDGPU_HSA_ISA, desc, size); + } + + bool AmdHsaCode::GetNoteIsa(std::string& vendor_name, std::string& architecture_name, uint32_t* major_version, uint32_t* minor_version, uint32_t* stepping) + { + amdgpu_hsa_note_isa_t *desc; + if (!GetAmdNote(NT_AMDGPU_HSA_ISA, &desc)) { return false; } + vendor_name = GetNoteString(desc->vendor_name_size, desc->vendor_and_architecture_name); + architecture_name 
= GetNoteString(desc->architecture_name_size, desc->vendor_and_architecture_name + vendor_name.length() + 1); + *major_version = desc->major; + *minor_version = desc->minor; + *stepping = desc->stepping; + return true; + } + + bool AmdHsaCode::GetNoteIsa(std::string& isaName) + { + std::string vendor_name, architecture_name; + uint32_t major_version, minor_version, stepping; + if (!GetNoteIsa(vendor_name, architecture_name, &major_version, &minor_version, &stepping)) { return false; } + isaName.clear(); + isaName += vendor_name; + isaName += ":"; + isaName += architecture_name; + isaName += ":"; + isaName += std::to_string(major_version); + isaName += ":"; + isaName += std::to_string(minor_version); + isaName += ":"; + isaName += std::to_string(stepping); + return true; + } + + void AmdHsaCode::AddNoteProducer(uint32_t major, uint32_t minor, const std::string& producer) + { + size_t size = sizeof(amdgpu_hsa_note_producer_t) + producer.length(); + amdgpu_hsa_note_producer_t* desc = (amdgpu_hsa_note_producer_t*) _alloca(size); + memset(desc, 0, size); + desc->producer_name_size = producer.length(); + desc->producer_major_version = major; + desc->producer_minor_version = minor; + memcpy(desc->producer_name, producer.c_str(), producer.length() + 1); + AddAmdNote(NT_AMDGPU_HSA_PRODUCER, desc, size); + } + + bool AmdHsaCode::GetNoteProducer(uint32_t* major, uint32_t* minor, std::string& producer_name) + { + amdgpu_hsa_note_producer_t* desc; + if (!GetAmdNote(NT_AMDGPU_HSA_PRODUCER, &desc)) { return false; } + *major = desc->producer_major_version; + *minor = desc->producer_minor_version; + producer_name = GetNoteString(desc->producer_name_size, desc->producer_name); + return true; + } + + void AmdHsaCode::AddNoteProducerOptions(const std::string& options) + { + size_t size = sizeof(amdgpu_hsa_note_producer_options_t) + options.length(); + amdgpu_hsa_note_producer_options_t *desc = (amdgpu_hsa_note_producer_options_t*) _alloca(size); + desc->producer_options_size = 
options.length(); + memcpy(desc->producer_options, options.c_str(), options.length() + 1); + AddAmdNote(NT_AMDGPU_HSA_PRODUCER_OPTIONS, desc, size); + } + + void AmdHsaCode::AddNoteProducerOptions(int32_t call_convention, const hsa_ext_control_directives_t& user_directives, const std::string& user_options) + { + using namespace code_options; + std::ostringstream ss; + ss << + space << "-hsa_call_convention=" << call_convention << + control_directives(user_directives); + if (!user_options.empty()) { + ss << space << user_options; + } + + AddNoteProducerOptions(ss.str()); + } + + bool AmdHsaCode::GetNoteProducerOptions(std::string& options) + { + amdgpu_hsa_note_producer_options_t* desc; + if (!GetAmdNote(NT_AMDGPU_HSA_PRODUCER_OPTIONS, &desc)) { return false; } + options = GetNoteString(desc->producer_options_size, desc->producer_options); + return true; + } + + hsa_status_t AmdHsaCode::GetInfo(hsa_code_object_info_t attribute, void *value) + { + assert(value); + switch (attribute) { + case HSA_CODE_OBJECT_INFO_VERSION: { + std::string version; + if (!GetNoteCodeObjectVersion(version)) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + char *svalue = (char*)value; + memset(svalue, 0x0, 64); + memcpy(svalue, version.c_str(), (std::min)(size_t(63), version.length())); + break; + } + case HSA_CODE_OBJECT_INFO_ISA: { + // TODO: Currently returns string representation instead of hsa_isa_t + // which is unavailable here. 
+ std::string isa; + if (!GetNoteIsa(isa)) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + char *svalue = (char*)value; + memset(svalue, 0x0, 64); + memcpy(svalue, isa.c_str(), (std::min)(size_t(63), isa.length())); + break; + } + case HSA_CODE_OBJECT_INFO_MACHINE_MODEL: + case HSA_CODE_OBJECT_INFO_PROFILE: + case HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE: { + uint32_t hsail_major, hsail_minor; + hsa_profile_t profile; + hsa_machine_model_t machine_model; + hsa_default_float_rounding_mode_t default_float_round; + if (!GetNoteHsail(&hsail_major, &hsail_minor, &profile, &machine_model, &default_float_round)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + switch (attribute) { + case HSA_CODE_OBJECT_INFO_MACHINE_MODEL: + *((hsa_machine_model_t*)value) = machine_model; break; + case HSA_CODE_OBJECT_INFO_PROFILE: + *((hsa_profile_t*)value) = profile; break; + case HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE: + *((hsa_default_float_rounding_mode_t*)value) = default_float_round; break; + default: break; + } + break; + } + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; + } + + hsa_status_t AmdHsaCode::GetSymbol(const char *module_name, const char *symbol_name, hsa_code_symbol_t *s) + { + std::string mname = MangleSymbolName(module_name ? 
module_name : "", symbol_name); + for (Symbol* sym : symbols) { + if (sym->Name() == mname) { + *s = Symbol::ToHandle(sym); + return HSA_STATUS_SUCCESS; + } + } + return HSA_STATUS_ERROR_INVALID_SYMBOL_NAME; + } + + hsa_status_t AmdHsaCode::IterateSymbols(hsa_code_object_t code_object, + hsa_status_t (*callback)( + hsa_code_object_t code_object, + hsa_code_symbol_t symbol, + void* data), + void* data) + { + for (Symbol* sym : symbols) { + hsa_code_symbol_t s = Symbol::ToHandle(sym); + hsa_status_t status = callback(code_object, s, data); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + return HSA_STATUS_SUCCESS; + } + + Section* AmdHsaCode::ImageInitSection() + { + if (!imageInit) { + imageInit = img->addSection( + ".hsaimage_imageinit", + SHT_PROGBITS, + SHF_MERGE, + sizeof(amdgpu_hsa_image_descriptor_t)); + } + return imageInit; + } + + void AmdHsaCode::AddImageInitializer(Symbol* image, uint64_t destOffset, const amdgpu_hsa_image_descriptor_t& desc) + { + uint64_t offset = ImageInitSection()->addData(&desc, sizeof(desc), 8); + amd::elf::Symbol* imageInit = + img->symtab()->addSymbol(ImageInitSection(), "", offset, 0, STT_AMDGPU_HSA_METADATA, STB_LOCAL); + image->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_INIT_IMAGE, imageInit, image->elfSym()->value() + destOffset, 0); + } + + void AmdHsaCode::AddImageInitializer( + Symbol* image, uint64_t destOffset, + amdgpu_hsa_metadata_kind16_t kind, + amdgpu_hsa_image_geometry8_t geometry, + amdgpu_hsa_image_channel_order8_t channel_order, amdgpu_hsa_image_channel_type8_t channel_type, + uint64_t width, uint64_t height, uint64_t depth, uint64_t array) + { + amdgpu_hsa_image_descriptor_t desc; + desc.size = (uint16_t) sizeof(amdgpu_hsa_image_descriptor_t); + desc.kind = kind; + desc.geometry = geometry; + desc.channel_order = channel_order; + desc.channel_type = channel_type; + desc.width = width; + desc.height = height; + desc.depth = depth; + desc.array = array; + 
AddImageInitializer(image, destOffset, desc); + } + + + Section* AmdHsaCode::SamplerInitSection() + { + if (!samplerInit) { + samplerInit = img->addSection( + ".hsaimage_samplerinit", + SHT_PROGBITS, + SHF_MERGE, + sizeof(amdgpu_hsa_sampler_descriptor_t)); + } + return samplerInit; + } + + void AmdHsaCode::AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, const amdgpu_hsa_sampler_descriptor_t& desc) + { + uint64_t offset = SamplerInitSection()->addData(&desc, sizeof(desc), 8); + amd::elf::Symbol* samplerInit = + img->symtab()->addSymbol(SamplerInitSection(), "", offset, 0, STT_AMDGPU_HSA_METADATA, STB_LOCAL); + sampler->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_INIT_SAMPLER, samplerInit, sampler->elfSym()->value() + destOffset, 0); + } + + void AmdHsaCode::AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, + amdgpu_hsa_sampler_coord8_t coord, + amdgpu_hsa_sampler_filter8_t filter, + amdgpu_hsa_sampler_addressing8_t addressing) + { + amdgpu_hsa_sampler_descriptor_t desc; + desc.size = (uint16_t) sizeof(amdgpu_hsa_sampler_descriptor_t); + desc.kind = AMDGPU_HSA_METADATA_KIND_INIT_SAMP; + desc.coord = coord; + desc.filter = filter; + desc.addressing = addressing; + AddSamplerInitializer(sampler, destOffset, desc); + } + + void AmdHsaCode::AddInitVarWithAddress(bool large, Symbol* dest, uint64_t destOffset, Symbol* addrOf, uint64_t addrAddend) + { + uint32_t rtype = large ? 
R_AMDGPU_64 : R_AMDGPU_32_LOW; + dest->elfSym()->section()->relocationSection()->addRelocation(rtype, addrOf->elfSym(), dest->elfSym()->value() + destOffset, addrAddend); + } + + uint64_t AmdHsaCode::NextKernelCodeOffset() const + { + return HsaText()->nextDataOffset(256); + } + + bool AmdHsaCode::AddKernelCode(KernelSymbol* sym, const void* code, size_t size) + { + assert(nullptr != sym); + + uint64_t offset = HsaText()->addData(code, size, 256); + sym->setValue(offset); + sym->setSize(size); + return true; + } + + Section* AmdHsaCode::AddEmptySection() + { + dataSections.push_back(nullptr); return nullptr; + } + + Section* AmdHsaCode::AddCodeSection(Segment* segment) + { + if (nullptr == img) { return nullptr; } + Section *sec = img->addSection( + ".hsatext", + SHT_PROGBITS, + SHF_ALLOC | SHF_EXECINSTR | SHF_WRITE | SHF_AMDGPU_HSA_CODE | SHF_AMDGPU_HSA_AGENT, + 0, + segment); + dataSections.push_back(sec); + hsatext = sec; + return sec; + } + + Section* AmdHsaCode::AddDataSection(const std::string &name, + uint32_t type, + uint64_t flags, + Segment* segment) + { + if (nullptr == img) { return nullptr; } + Section *sec = img->addSection(name, type, flags, 0, segment); + dataSections.push_back(sec); + return sec; + } + + void AmdHsaCode::InitHsaSectionSegment(amdgpu_hsa_elf_section_t section, bool combineSegments) + { + InitHsaSegment(AmdHsaElfSectionSegment(section), combineSegments || !IsAmdHsaElfSectionROData(section)); + } + + Section* AmdHsaCode::HsaDataSection(amdgpu_hsa_elf_section_t sec, bool combineSegments) + { + if (!hsaSections[sec]) { + bool writable = combineSegments || !IsAmdHsaElfSectionROData(sec); + Segment* segment = HsaSegment(AmdHsaElfSectionSegment(sec), writable); + assert(segment); // Expected to be init the segment via InitHsaSegment. 
+ Section* section; + switch (sec) { + case AMDGPU_HSA_RODATA_GLOBAL_PROGRAM: + section = AddDataSection(".hsarodata_global_program", SHT_PROGBITS, SHF_ALLOC | SHF_AMDGPU_HSA_GLOBAL, segment); break; + case AMDGPU_HSA_RODATA_GLOBAL_AGENT: + section = AddDataSection(".hsarodata_global_agent", SHT_PROGBITS, SHF_ALLOC | SHF_AMDGPU_HSA_GLOBAL | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_RODATA_READONLY_AGENT: + section = AddDataSection(".hsarodata_readonly_agent", SHT_PROGBITS, SHF_ALLOC | SHF_AMDGPU_HSA_READONLY | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_DATA_GLOBAL_PROGRAM: + section = AddDataSection(".hsadata_global_program", SHT_PROGBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_GLOBAL, segment); break; + case AMDGPU_HSA_DATA_GLOBAL_AGENT: + section = AddDataSection(".hsadata_global_agent", SHT_PROGBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_GLOBAL | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_DATA_READONLY_AGENT: + section = AddDataSection(".hsadata_readonly_agent", SHT_PROGBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_READONLY | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_BSS_GLOBAL_PROGRAM: + section = AddDataSection(".hsabss_global_program", SHT_NOBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_GLOBAL, segment); break; + case AMDGPU_HSA_BSS_GLOBAL_AGENT: + section = AddDataSection(".hsabss_global_agent", SHT_NOBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_GLOBAL | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_BSS_READONLY_AGENT: + section = AddDataSection(".hsabss_readonly_agent", SHT_NOBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_READONLY | SHF_AMDGPU_HSA_AGENT, segment); break; + default: + assert(false); return 0; + } + hsaSections[sec] = section; + } + return hsaSections[sec]; + } + + void AmdHsaCode::InitHsaSegment(amdgpu_hsa_elf_segment_t segment, bool writable) + { + if (!hsaSegments[segment][writable]) { + uint32_t flags = PF_R; + if (writable) { flags |= PF_W; } + if (segment == 
AMDGPU_HSA_SEGMENT_CODE_AGENT) { flags |= PF_X; } + uint32_t type = PT_LOOS + segment; + assert(segment < AMDGPU_HSA_SEGMENT_LAST); + hsaSegments[segment][writable] = img->initSegment(type, flags); + } + } + + bool AmdHsaCode::AddHsaSegments() + { + if (!img->addSegments()) { return ElfImageError(); } + return true; + } + + Segment* AmdHsaCode::HsaSegment(amdgpu_hsa_elf_segment_t segment, bool writable) + { + return hsaSegments[segment][writable]; + } + + Symbol* AmdHsaCode::AddExecutableSymbol(const std::string &name, + unsigned char type, + unsigned char binding, + unsigned char other, + Section *section) + { + if (nullptr == img) { return nullptr; } + if (!section) { section = HsaText(); } + symbols.push_back(new KernelSymbol(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr)); + return symbols.back(); + } + + Symbol* AmdHsaCode::AddVariableSymbol(const std::string &name, + unsigned char type, + unsigned char binding, + unsigned char other, + Section *section, + uint64_t value, + uint64_t size) + { + if (nullptr == img) { return nullptr; } + symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(section, name, value, size, type, binding, other))); + return symbols.back(); + } + + void AmdHsaCode::AddSectionSymbols() + { + if (nullptr == img) { return; } + for (size_t i = 0; i < dataSections.size(); ++i) { + if (dataSections[i] && dataSections[i]->flags() & SHF_ALLOC) { + symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL))); + } + } + } + + Symbol* AmdHsaCode::GetSymbolByElfIndex(size_t index) + { + for (auto &s : symbols) { + if (s && index == s->Index()) { + return s; + } + } + return nullptr; + } + + Symbol* AmdHsaCode::FindSymbol(const std::string &n) + { + for (auto &s : symbols) { + if (s && n == s->Name()) { + return s; + } + } + return nullptr; + } + + void AmdHsaCode::AddData(amdgpu_hsa_elf_section_t s, const void* data, 
size_t size) + { +// getDataSection(s)->addData(data, size); + } + + Section* AmdHsaCode::DebugInfo() + { + if (!debugInfo) { + debugInfo = img->addSection(".debug_info", SHT_PROGBITS); + } + return debugInfo; + } + + Section* AmdHsaCode::DebugLine() + { + if (!debugLine) { + debugLine = img->addSection(".debug_line", SHT_PROGBITS); + } + return debugLine; + } + + Section* AmdHsaCode::DebugAbbrev() + { + if (!debugAbbrev) { + debugAbbrev = img->addSection(".debug_abbrev", SHT_PROGBITS); + } + return debugAbbrev; + } + + Section* AmdHsaCode::AddHsaHlDebug(const std::string& name, const void* data, size_t size) + { + Section* section = img->addSection(name, SHT_PROGBITS, SHF_OS_NONCONFORMING); + section->addData(data, size, 1); + return section; + } + + bool AmdHsaCode::PrintToFile(const std::string& filename) + { + std::ofstream out(filename); + if (out.fail()) { return false; } + Print(out); + return out.fail(); + } + + void AmdHsaCode::Print(std::ostream& out) + { + PrintNotes(out); + out << std::endl; + PrintSegments(out); + out << std::endl; + PrintSections(out); + out << std::endl; + PrintSymbols(out); + out << std::endl; + PrintMachineCode(out); + out << "AMD HSA Code Object End" << std::endl; + } + + void AmdHsaCode::PrintNotes(std::ostream& out) + { + { + uint32_t major_version, minor_version; + if (GetNoteCodeObjectVersion(&major_version, &minor_version)) { + out << "AMD HSA Code Object" << std::endl + << " Version " << major_version << "." << minor_version << std::endl; + } + } + { + uint32_t hsail_major, hsail_minor; + hsa_profile_t profile; + hsa_machine_model_t machine_model; + hsa_default_float_rounding_mode_t rounding_mode; + if (GetNoteHsail(&hsail_major, &hsail_minor, &profile, &machine_model, &rounding_mode)) { + out << "HSAIL " << std::endl + << " Version: " << hsail_major << "." 
<< hsail_minor << std::endl + << " Profile: " << HsaProfileToString(profile) + << " Machine model: " << HsaMachineModelToString(machine_model) + << " Default float rounding: " << HsaFloatRoundingModeToString(rounding_mode) << std::endl; + } + } + { + std::string vendor_name, architecture_name; + uint32_t major_version, minor_version, stepping; + if (GetNoteIsa(vendor_name, architecture_name, &major_version, &minor_version, &stepping)) { + out << "ISA" << std::endl + << " Vendor " << vendor_name + << " Arch " << architecture_name + << " Version " << major_version << ":" << minor_version << ":" << stepping << std::endl; + } + } + { + std::string producer_name, producer_options; + uint32_t major, minor; + if (GetNoteProducer(&major, &minor, producer_name)) { + out << "Producer '" << producer_name << "' " << "Version " << major << ":" << minor << std::endl; + } + } + { + std::string producer_options; + if (GetNoteProducerOptions(producer_options)) { + out << "Producer options" << std::endl + << " '" << producer_options << "'" << std::endl; + } + } + } + + void AmdHsaCode::PrintSegments(std::ostream& out) + { + out << "Segments (total " << DataSegmentCount() << "):" << std::endl; + for (size_t i = 0; i < DataSegmentCount(); ++i) { + PrintSegment(out, DataSegment(i)); + } + } + + void AmdHsaCode::PrintSections(std::ostream& out) + { + out << "Data Sections (total " << DataSectionCount() << "):" << std::endl; + for (size_t i = 0; i < DataSectionCount(); ++i) { + PrintSection(out, DataSection(i)); + } + out << std::endl; + out << "Relocation Sections (total " << RelocationSectionCount() << "):" << std::endl; + for (size_t i = 0; i < RelocationSectionCount(); ++i) { + PrintSection(out, GetRelocationSection(i)); + } + } + + void AmdHsaCode::PrintSymbols(std::ostream& out) + { + out << "Symbols (total " << SymbolCount() << "):" << std::endl; + for (size_t i = 0; i < SymbolCount(); ++i) { + PrintSymbol(out, GetSymbol(i)); + } + } + + void 
AmdHsaCode::PrintMachineCode(std::ostream& out) + { + if (HasHsaText()) { + out << std::dec; + for (size_t i = 0; i < SymbolCount(); ++i) { + Symbol* sym = GetSymbol(i); + if (sym->IsKernelSymbol() && sym->IsDefinition()) { + amd_kernel_code_t kernel_code; + HsaText()->getData(sym->SectionOffset(), &kernel_code, sizeof(amd_kernel_code_t)); + out << "AMD Kernel Code for " << sym->Name() << ": " << std::endl << std::dec; + PrintAmdKernelCode(out, &kernel_code); + out << std::endl; + } + } + + std::vector isa(HsaText()->size(), 0); + HsaText()->getData(0, isa.data(), HsaText()->size()); + + out << "Disassembly:" << std::endl; + PrintDisassembly(out, isa.data(), HsaText()->size(), 0); + out << std::endl << std::dec; + } else { + out << "Machine code section is not present" << std::endl << std::endl; + } + } + + void AmdHsaCode::PrintSegment(std::ostream& out, Segment* segment) + { + out << " Segment (" << segment->getSegmentIndex() << ")" << std::endl; + out << " Type: " << AmdPTLoadToString(segment->type()) + << " " + << " Flags: " << "0x" << std::hex << std::setw(8) << std::setfill('0') << segment->flags() << std::dec + << std::endl + << " Image Size: " << segment->imageSize() + << " " + << " Memory Size: " << segment->memSize() + << " " + << " Align: " << segment->align() + << " " + << " VAddr: " << segment->vaddr() + << std::endl; + out << std::dec; + } + + void AmdHsaCode::PrintSection(std::ostream& out, Section* section) + { + out << " Section " << section->Name() << " (Index " << section->getSectionIndex() << ")" << std::endl; + out << " Type: " << section->type() + << " " + << " Flags: " << "0x" << std::hex << std::setw(8) << std::setfill('0') << section->flags() << std::dec + << std::endl + << " Size: " << section->size() + << " " + << " Address: " << section->addr() + << " " + << " Align: " << section->addralign() + << std::endl; + out << std::dec; + + if (section->flags() & SHF_AMDGPU_HSA_CODE) { + // Printed separately. 
+ return; + } + + switch (section->type()) { + case SHT_NOBITS: + return; + case SHT_RELA: + PrintRelocationData(out, section->asRelocationSection()); + return; + default: + PrintRawData(out, section); + } + } + + void AmdHsaCode::PrintRawData(std::ostream& out, Section* section) /* Copies the section contents into a temporary buffer and hex-dumps it. */ + { + out << " Data:" << std::endl; + std::vector<unsigned char> sdata(section->size()); /* heap buffer: the size comes from the ELF section, so alloca() could overflow the stack */ + section->getData(0, sdata.data(), sdata.size()); + PrintRawData(out, sdata.data(), sdata.size()); + } + + void AmdHsaCode::PrintRawData(std::ostream& out, const unsigned char *data, size_t size) + { + out << std::hex << std::setfill('0'); + for (size_t i = 0; i < size; i += 16) { + out << " " << std::setw(7) << i << ":"; + + for (size_t j = 0; j < 16; j += 1) { + uint32_t value = i + j < size ? (uint32_t)data[i + j] : 0; + if (j % 2 == 0) { out << ' '; } + out << std::setw(2) << value; + } + out << " "; + + for (size_t j = 0; i + j < size && j < 16; j += 1) { + char value = (char)data[i + j] >= 32 && (char)data[i + j] <= 126 ? 
(char)data[i + j] : '.'; + out << value; + } + out << std::endl; + } + out << std::dec; + } + + void AmdHsaCode::PrintRelocationData(std::ostream& out, RelocationSection* section) + { + out << " Relocation Entries for " << section->targetSection()->Name() << " Section (total " << section->relocationCount() << "):" << std::endl; + for (size_t i = 0; i < section->relocationCount(); ++i) { + out << " Relocation (Index " << i << "):" << std::endl; + out << " Type: " << section->relocation(i)->type() << std::endl; + out << " Symbol: " << section->relocation(i)->symbol()->name() << std::endl; + out << " Offset: " << section->relocation(i)->offset() << " Addend: " << section->relocation(i)->addend() << std::endl; + } + out << std::dec; + } + + void AmdHsaCode::PrintSymbol(std::ostream& out, Symbol* sym) + { + out << " Symbol " << sym->Name() << " (Index " << sym->Index() << "):" << std::endl; + if (sym->IsKernelSymbol() || sym->IsVariableSymbol()) { + out << " Section: " << sym->GetSection()->Name() << " "; + out << " Section Offset: " << sym->SectionOffset() << std::endl; + out << " VAddr: " << sym->VAddr() << " "; + out << " Size: " << sym->Size() << " "; + out << " Alignment: " << sym->Alignment() << std::endl; + out << " Kind: " << HsaSymbolKindToString(sym->Kind()) << " "; + out << " Linkage: " << HsaSymbolLinkageToString(sym->Linkage()) << " "; + out << " Definition: " << (sym->IsDefinition() ? "TRUE" : "FALSE") << std::endl; + } + if (sym->IsVariableSymbol()) { + out << " Allocation: " << HsaVariableAllocationToString(sym->Allocation()) << " "; + out << " Segment: " << HsaVariableSegmentToString(sym->Segment()) << " "; + out << " Constant: " << (sym->IsConst() ? 
"TRUE" : "FALSE") << std::endl; + } + out << std::dec; + } + + void AmdHsaCode::PrintMachineCode(std::ostream& out, KernelSymbol* sym) + { + assert(HsaText()); + amd_kernel_code_t kernel_code; + HsaText()->getData(sym->SectionOffset(), &kernel_code, sizeof(amd_kernel_code_t)); + + out << "AMD Kernel Code for " << sym->Name() << ": " << std::endl << std::dec; + PrintAmdKernelCode(out, &kernel_code); + out << std::endl; + + std::vector isa(HsaText()->size(), 0); + HsaText()->getData(0, isa.data(), HsaText()->size()); + uint64_t isa_offset = sym->SectionOffset() + kernel_code.kernel_code_entry_byte_offset; + + out << "Disassembly for " << sym->Name() << ": " << std::endl; + PrintDisassembly(out, isa.data(), HsaText()->size(), isa_offset); + out << std::endl << std::dec; + } + + void AmdHsaCode::PrintDisassembly(std::ostream& out, const unsigned char *isa, size_t size, uint32_t isa_offset) + { + PrintRawData(out, isa, size); + out << std::dec; + } + + std::string AmdHsaCode::MangleSymbolName(const std::string& module_name, const std::string symbol_name) + { + if (module_name.empty()) { + return symbol_name; + } else { + return module_name + "::" + symbol_name; + } + } + + bool AmdHsaCode::ElfImageError() + { + out << img->output(); + return false; + } + + AmdHsaCode* AmdHsaCodeManager::FromHandle(hsa_code_object_t c) + { + CodeMap::iterator i = codeMap.find(c.handle); + if (i == codeMap.end()) { + AmdHsaCode* code = new AmdHsaCode(); + const void* buffer = reinterpret_cast(c.handle); + if (!code->InitAsBuffer(buffer, 0)) { + delete code; + return 0; + } + codeMap[c.handle] = code; + return code; + } + return i->second; + } + + bool AmdHsaCodeManager::Destroy(hsa_code_object_t c) + { + CodeMap::iterator i = codeMap.find(c.handle); + if (i == codeMap.end()) { + // Currently, we do not always create map entry for every code object buffer. 
+ return true; + } + delete i->second; + codeMap.erase(i); + return true; + } + + bool AmdHsaCode::PullElfV2() + { + for (size_t i = 0; i < img->segmentCount(); ++i) { + Segment* s = img->segment(i); + if (s->type() == PT_LOAD) { + dataSegments.push_back(s); + } + } + for (size_t i = 0; i < img->sectionCount(); ++i) { + Section* sec = img->section(i); + if (!sec) { continue; } + if ((sec->type() == SHT_PROGBITS || sec->type() == SHT_NOBITS) && + !(sec->flags() & SHF_EXECINSTR)) { + dataSections.push_back(sec); + } else if (sec->type() == SHT_RELA) { + relocationSections.push_back(sec->asRelocationSection()); + } + if (sec->Name() == ".text") { + hsatext = sec; + } + } + for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) { + amd::elf::Symbol* elfsym = img->symtab()->symbol(i); + Symbol* sym = 0; + switch (elfsym->type()) { + case STT_AMDGPU_HSA_KERNEL: { + amd::elf::Section* sec = elfsym->section(); + amd_kernel_code_t akc; + if (!sec) { + out << "Failed to find section for symbol " << elfsym->name() << std::endl; + return false; + } + if (!(sec->flags() & (SHF_ALLOC | SHF_EXECINSTR))) { + out << "Invalid code section for symbol " << elfsym->name() << std::endl; + return false; + } + if (!sec->getData(elfsym->value() - sec->addr(), &akc, sizeof(amd_kernel_code_t))) { + out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl; + return false; + } + sym = new KernelSymbolV2(elfsym, &akc); + break; + } + case STT_OBJECT: + case STT_COMMON: + sym = new VariableSymbolV2(elfsym); + break; + default: + break; // Skip unknown symbols. 
+ } + if (sym) { symbols.push_back(sym); } + } + + return true; + } + + KernelSymbolV2::KernelSymbolV2(amd::elf::Symbol* elfsym_, const amd_kernel_code_t* akc) : + KernelSymbol(elfsym_, akc) { } +} +} +} diff --git a/runtime/hsa-runtime/libamdhsacode/amd_hsa_code_util.cpp b/runtime/hsa-runtime/libamdhsacode/amd_hsa_code_util.cpp new file mode 100644 index 0000000000..66955333de --- /dev/null +++ b/runtime/hsa-runtime/libamdhsacode/amd_hsa_code_util.cpp @@ -0,0 +1,1033 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "amd_hsa_code_util.hpp" +#include "libelf.h" +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#include +#include +#else // _WIN32 +#include +#include +#include +#include +#include +#endif // _WIN32 +#include "Brig.h" + +namespace { +auto eq = " = "; + +std::ostream& attr1(std::ostream& out) +{ + out << " " << std::left << std::setw(60) << std::setfill(' '); + return out; +} + +std::ostream& attr2(std::ostream& out) +{ + out << " " << std::left << std::setw(58) << std::setfill(' '); + return out; +} +} // namespace anonymous + +namespace amd { +namespace hsa { +namespace common { + +bool IsAccessibleMemoryAddress(uint64_t address) +{ + if (0 == address) { + return false; + } +#if defined(_WIN32) || defined(_WIN64) + MEMORY_BASIC_INFORMATION memory_info; + if (!VirtualQuery(reinterpret_cast(address), &memory_info, sizeof(memory_info))) { + return false; + } + int32_t is_accessible = ((memory_info.Protect & PAGE_READONLY) || + (memory_info.Protect & PAGE_READWRITE) || + (memory_info.Protect & PAGE_WRITECOPY) || + (memory_info.Protect & PAGE_EXECUTE_READ) || + (memory_info.Protect & PAGE_EXECUTE_READWRITE) || + (memory_info.Protect & PAGE_EXECUTE_WRITECOPY)); + if (memory_info.Protect & PAGE_GUARD) { + is_accessible = 0; + } + if (memory_info.Protect & PAGE_NOACCESS) { + is_accessible = 0; + } + return is_accessible > 0; +#else + int32_t random_fd = 0; + 
ssize_t bytes_written = 0; + if (-1 == (random_fd = open("/dev/random", O_WRONLY))) { + return false; + } + bytes_written = write(random_fd, (void*)address, 1); + if (-1 == close(random_fd)) { + return false; + } + return bytes_written == 1; +#endif // _WIN32 || _WIN64 +} + +} + +std::string HsaSymbolKindToString(hsa_symbol_kind_t kind) +{ + switch (kind) { + case HSA_SYMBOL_KIND_VARIABLE: return "VARIABLE"; + case HSA_SYMBOL_KIND_INDIRECT_FUNCTION: return "INDIRECT_FUNCTION"; + case HSA_SYMBOL_KIND_KERNEL: return "KERNEL"; + default: return "UNKNOWN"; + } +} + +std::string HsaSymbolLinkageToString(hsa_symbol_linkage_t linkage) +{ + switch (linkage) { + case HSA_SYMBOL_LINKAGE_MODULE: return "MODULE"; + case HSA_SYMBOL_LINKAGE_PROGRAM: return "PROGRAM"; + default: return "UNKNOWN"; + } +} + +std::string HsaVariableAllocationToString(hsa_variable_allocation_t allocation) +{ + switch (allocation) { + case HSA_VARIABLE_ALLOCATION_AGENT: return "AGENT"; + case HSA_VARIABLE_ALLOCATION_PROGRAM: return "PROGRAM"; + default: return "UNKNOWN"; + } +} + +std::string HsaVariableSegmentToString(hsa_variable_segment_t segment) +{ + switch (segment) { + case HSA_VARIABLE_SEGMENT_GLOBAL: return "GLOBAL"; + case HSA_VARIABLE_SEGMENT_READONLY: return "READONLY"; + default: return "UNKNOWN"; + } +} + +std::string HsaProfileToString(hsa_profile_t profile) +{ + switch (profile) { + case HSA_PROFILE_BASE: return "BASE"; + case HSA_PROFILE_FULL: return "FULL"; + default: return "UNKNOWN"; + } +} + +std::string HsaMachineModelToString(hsa_machine_model_t model) +{ + switch (model) { + case HSA_MACHINE_MODEL_SMALL: return "SMALL"; + case HSA_MACHINE_MODEL_LARGE: return "LARGE"; + default: return "UNKNOWN"; + } +} + +std::string HsaFloatRoundingModeToString(hsa_default_float_rounding_mode_t mode) +{ + switch (mode) { + case HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT: return "DEFAULT"; + case HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO: return "ZERO"; + case HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR: 
return "NEAR"; + default: return "UNKNOWN"; + } +} + +std::string AmdMachineKindToString(amd_machine_kind16_t machine) +{ + switch (machine) { + case AMD_MACHINE_KIND_UNDEFINED: return "UNDEFINED"; + case AMD_MACHINE_KIND_AMDGPU: return "AMDGPU"; + default: return "UNKNOWN"; + } +} + +std::string AmdFloatRoundModeToString(amd_float_round_mode_t round_mode) +{ + switch (round_mode) { + case AMD_FLOAT_ROUND_MODE_NEAREST_EVEN: return "NEAREST_EVEN"; + case AMD_FLOAT_ROUND_MODE_PLUS_INFINITY: return "PLUS_INFINITY"; + case AMD_FLOAT_ROUND_MODE_MINUS_INFINITY: return "MINUS_INFINITY"; + case AMD_FLOAT_ROUND_MODE_ZERO: return "ZERO"; + default: return "UNKNOWN"; + } +} + +std::string AmdFloatDenormModeToString(amd_float_denorm_mode_t denorm_mode) +{ + switch (denorm_mode) { + case AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT: return "FLUSH_SOURCE_OUTPUT"; + case AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT: return "FLUSH_OUTPUT"; + case AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE: return "FLUSH_SOURCE"; + case AMD_FLOAT_DENORM_MODE_NO_FLUSH: return "FLUSH_NONE"; + default: return "UNKNOWN"; + } +} + +std::string AmdSystemVgprWorkitemIdToString(amd_system_vgpr_workitem_id_t system_vgpr_workitem_id) +{ + switch (system_vgpr_workitem_id) { + case AMD_SYSTEM_VGPR_WORKITEM_ID_X: return "X"; + case AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y: return "X, Y"; + case AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z: return "X, Y, Z"; + default: return "UNKNOWN"; + } +} + +std::string AmdElementByteSizeToString(amd_element_byte_size_t element_byte_size) +{ + switch (element_byte_size) { + case AMD_ELEMENT_BYTE_SIZE_2: return "WORD (2 bytes)"; + case AMD_ELEMENT_BYTE_SIZE_4: return "DWORD (4 bytes)"; + case AMD_ELEMENT_BYTE_SIZE_8: return "QWORD (8 bytes)"; + case AMD_ELEMENT_BYTE_SIZE_16: return "16 bytes"; + default: return "UNKNOWN"; + } +} + +std::string AmdExceptionKindToString(amd_exception_kind16_t exceptions) +{ + std::string e; + if (exceptions & AMD_EXCEPTION_KIND_INVALID_OPERATION) { + e += ", INVALID_OPERATON"; + 
exceptions &= ~AMD_EXCEPTION_KIND_INVALID_OPERATION; + } + if (exceptions & AMD_EXCEPTION_KIND_DIVISION_BY_ZERO) { + e += ", DIVISION_BY_ZERO"; + exceptions &= ~AMD_EXCEPTION_KIND_DIVISION_BY_ZERO; + } + if (exceptions & AMD_EXCEPTION_KIND_OVERFLOW) { + e += ", OVERFLOW"; + exceptions &= ~AMD_EXCEPTION_KIND_OVERFLOW; + } + if (exceptions & AMD_EXCEPTION_KIND_UNDERFLOW) { + e += ", UNDERFLOW"; + exceptions &= ~AMD_EXCEPTION_KIND_UNDERFLOW; + } + if (exceptions & AMD_EXCEPTION_KIND_INEXACT) { + e += ", INEXACT"; + exceptions &= ~AMD_EXCEPTION_KIND_INEXACT; + } + if (exceptions) { + e += ", UNKNOWN"; + } + if (!e.empty()) { + e = "[" + e.erase(0, 2) + "]"; + } + return e; +} + +std::string AmdPowerTwoToString(amd_powertwo8_t p) +{ + return std::to_string(1 << (unsigned) p); +} + +amdgpu_hsa_elf_segment_t AmdHsaElfSectionSegment(amdgpu_hsa_elf_section_t sec) +{ + switch (sec) { + case AMDGPU_HSA_RODATA_GLOBAL_PROGRAM: + case AMDGPU_HSA_DATA_GLOBAL_PROGRAM: + case AMDGPU_HSA_BSS_GLOBAL_PROGRAM: + return AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM; + case AMDGPU_HSA_RODATA_GLOBAL_AGENT: + case AMDGPU_HSA_DATA_GLOBAL_AGENT: + case AMDGPU_HSA_BSS_GLOBAL_AGENT: + return AMDGPU_HSA_SEGMENT_GLOBAL_AGENT; + case AMDGPU_HSA_RODATA_READONLY_AGENT: + case AMDGPU_HSA_DATA_READONLY_AGENT: + case AMDGPU_HSA_BSS_READONLY_AGENT: + return AMDGPU_HSA_SEGMENT_READONLY_AGENT; + default: + assert(false); return AMDGPU_HSA_SEGMENT_LAST; + } +} + +bool IsAmdHsaElfSectionROData(amdgpu_hsa_elf_section_t sec) /* Returns true only for the three AMDGPU_HSA_RODATA_* section kinds. */ +{ + switch (sec) { + case AMDGPU_HSA_RODATA_GLOBAL_PROGRAM: + case AMDGPU_HSA_RODATA_GLOBAL_AGENT: + case AMDGPU_HSA_RODATA_READONLY_AGENT: return true; /* previously fell through to 'default', so the predicate was always false */ + default: + return false; + } +} + +std::string AmdHsaElfSegmentToString(amdgpu_hsa_elf_segment_t seg) +{ + switch (seg) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: return "GLOBAL_PROGRAM"; + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: return "GLOBAL_AGENT"; + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: return "READONLY_AGENT"; + case AMDGPU_HSA_SEGMENT_CODE_AGENT: 
return "CODE_AGENT"; + default: return "UNKNOWN"; + } +} + +std::string AmdPTLoadToString(uint64_t type) +{ + if (PT_LOOS <= type && type < PT_LOOS + AMDGPU_HSA_SEGMENT_LAST) { + return AmdHsaElfSegmentToString((amdgpu_hsa_elf_segment_t) (type - PT_LOOS)); + } else { + return "UNKNOWN (" + std::to_string(type) + ")"; + } +} + +void PrintAmdKernelCode(std::ostream& out, const amd_kernel_code_t *akc) +{ + uint32_t is_debug_enabled = AMD_HSA_BITS_GET(akc->kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED); + + out << attr1 << "amd_kernel_code_version_major" << eq + << akc->amd_kernel_code_version_major + << std::endl; + out << attr1 << "amd_kernel_code_version_minor" << eq + << akc->amd_kernel_code_version_minor + << std::endl; + out << attr1 << "amd_machine_kind" << eq + << AmdMachineKindToString(akc->amd_machine_kind) + << std::endl; + out << attr1 << "amd_machine_version_major" << eq + << (uint32_t)akc->amd_machine_version_major + << std::endl; + out << attr1 << "amd_machine_version_minor" << eq + << (uint32_t)akc->amd_machine_version_minor + << std::endl; + out << attr1 << "amd_machine_version_stepping" << eq + << (uint32_t)akc->amd_machine_version_stepping + << std::endl; + out << attr1 << "kernel_code_entry_byte_offset" << eq + << akc->kernel_code_entry_byte_offset + << std::endl; + if (akc->kernel_code_prefetch_byte_offset) { + out << attr1 << "kernel_code_prefetch_byte_offset" << eq + << akc->kernel_code_prefetch_byte_offset + << std::endl; + } + if (akc->kernel_code_prefetch_byte_size) { + out << attr1 << "kernel_code_prefetch_byte_size" << eq + << akc->kernel_code_prefetch_byte_size + << std::endl; + } + out << attr1 << "max_scratch_backing_memory_byte_size" << eq + << akc->max_scratch_backing_memory_byte_size + << std::endl; + PrintAmdComputePgmRsrcOne(out, akc->compute_pgm_rsrc1); + PrintAmdComputePgmRsrcTwo(out, akc->compute_pgm_rsrc2); + PrintAmdKernelCodeProperties(out, akc->kernel_code_properties); + if 
(akc->workitem_private_segment_byte_size) { + out << attr1 << "workitem_private_segment_byte_size" << eq + << akc->workitem_private_segment_byte_size + << std::endl; + } + if (akc->workgroup_group_segment_byte_size) { + out << attr1 << "workgroup_group_segment_byte_size" << eq + << akc->workgroup_group_segment_byte_size + << std::endl; + } + if (akc->gds_segment_byte_size) { + out << attr1 << "gds_segment_byte_size" << eq + << akc->gds_segment_byte_size + << std::endl; + } + if (akc->kernarg_segment_byte_size) { + out << attr1 << "kernarg_segment_byte_size" << eq + << akc->kernarg_segment_byte_size + << std::endl; + } + if (akc->workgroup_fbarrier_count) { + out << attr1 << "workgroup_fbarrier_count" << eq + << akc->workgroup_fbarrier_count + << std::endl; + } + out << attr1 << "wavefront_sgpr_count" << eq + << (uint32_t)akc->wavefront_sgpr_count + << std::endl; + out << attr1 << "workitem_vgpr_count" << eq + << (uint32_t)akc->workitem_vgpr_count + << std::endl; + if (akc->reserved_vgpr_count > 0) { + out << attr1 << "reserved_vgpr_first" << eq + << (uint32_t)akc->reserved_vgpr_first + << std::endl; + out << attr1 << "reserved_vgpr_count" << eq + << (uint32_t)akc->reserved_vgpr_count + << std::endl; + } + if (akc->reserved_sgpr_count > 0) { + out << attr1 << "reserved_sgpr_first" << eq + << (uint32_t)akc->reserved_sgpr_first + << std::endl; + out << attr1 << "reserved_sgpr_count" << eq + << (uint32_t)akc->reserved_sgpr_count + << std::endl; + } + if (is_debug_enabled && (akc->debug_wavefront_private_segment_offset_sgpr != uint16_t(-1))) { + out << attr1 << "debug_wavefront_private_segment_offset_sgpr" << eq + << (uint32_t)akc->debug_wavefront_private_segment_offset_sgpr + << std::endl; + } + if (is_debug_enabled && (akc->debug_private_segment_buffer_sgpr != uint16_t(-1))) { + out << attr1 << "debug_private_segment_buffer_sgpr" << eq + << (uint32_t)akc->debug_private_segment_buffer_sgpr + << ":" + << (uint32_t)(akc->debug_private_segment_buffer_sgpr + 3) + << 
std::endl; + } + if (akc->kernarg_segment_alignment) { + out << attr1 << "kernarg_segment_alignment" << eq + << AmdPowerTwoToString(akc->kernarg_segment_alignment) + << " (" << (uint32_t) akc->kernarg_segment_alignment << ")" + << std::endl; + } + if (akc->group_segment_alignment) { + out << attr1 << "group_segment_alignment" << eq + << AmdPowerTwoToString(akc->group_segment_alignment) + << " (" << (uint32_t) akc->group_segment_alignment << ")" + << std::endl; + } + if (akc->private_segment_alignment) { + out << attr1 << "private_segment_alignment" << eq + << AmdPowerTwoToString(akc->private_segment_alignment) + << " (" << (uint32_t) akc->private_segment_alignment << ")" + << std::endl; + } + out << attr1 << "wavefront_size" << eq + << AmdPowerTwoToString(akc->wavefront_size) + << " (" << (uint32_t) akc->wavefront_size << ")" + << std::endl; + PrintAmdControlDirectives(out, akc->control_directives); +} + +void PrintAmdComputePgmRsrcOne(std::ostream& out, amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1) +{ + out << " COMPUTE_PGM_RSRC1 (0x" << std::hex << std::setw(8) << std::setfill('0') << compute_pgm_rsrc1 << "):" << std::endl; + out << std::dec; + + uint32_t granulated_workitem_vgpr_count = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); + out << attr2 << "granulated_workitem_vgpr_count" << eq + << granulated_workitem_vgpr_count + << std::endl; + uint32_t granulated_wavefront_sgpr_count = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); + out << attr2 << "granulated_wavefront_sgpr_count" << eq + << granulated_wavefront_sgpr_count + << std::endl; + uint32_t priority = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY); + out << attr2 << "priority" << eq + << priority + << std::endl; + uint32_t float_round_mode_32 = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32); + out << attr2 << "float_round_mode_32" << eq + << 
AmdFloatRoundModeToString((amd_float_round_mode_t)float_round_mode_32) + << std::endl; + uint32_t float_round_mode_16_64 = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64); + out << attr2 << "float_round_mode_16_64" << eq + << AmdFloatRoundModeToString((amd_float_round_mode_t)float_round_mode_16_64) + << std::endl; + uint32_t float_denorm_mode_32 = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32); + out << attr2 << "float_denorm_mode_32" << eq + << AmdFloatDenormModeToString((amd_float_denorm_mode_t)float_denorm_mode_32) + << std::endl; + uint32_t float_denorm_mode_16_64 = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64); + out << attr2 << "float_denorm_mode_16_64" << eq + << AmdFloatDenormModeToString((amd_float_denorm_mode_t)float_denorm_mode_16_64) + << std::endl; + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_PRIV)) { + out << attr2 << "priv" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP)) { + out << attr2 << "enable_dx10_clamp" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE)) { + out << attr2 << "debug_mode" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE)) { + out << attr2 << "enable_ieee_mode" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_BULKY)) { + out << attr2 << "bulky" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER)) { + out << attr2 << "cdbg_user" << eq << "TRUE" + << std::endl; + } +} + +void PrintAmdComputePgmRsrcTwo(std::ostream& out, amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2) +{ + out << " COMPUTE_PGM_RSRC2 (0x" << std::hex << std::setw(8) << std::setfill('0') << compute_pgm_rsrc2 << "):" 
<< std::endl; + out << std::dec; + + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET)) { + out << attr2 << "enable_sgpr_private_segment_wave_byte_offset" << eq << "TRUE" + << std::endl; + } + uint32_t user_sgpr_count = AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT); + out << attr2 << "user_sgpr_count" << eq + << user_sgpr_count + << std::endl; + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER)) { + out << attr2 << "enable_trap_handler" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X)) { + out << attr2 << "enable_sgpr_workgroup_id_x" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y)) { + out << attr2 << "enable_sgpr_workgroup_id_y" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z)) { + out << attr2 << "enable_sgpr_workgroup_id_z" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO)) { + out << attr2 << "enable_sgpr_workgroup_info" << eq << "TRUE" + << std::endl; + } + uint32_t enable_vgpr_workitem_id = AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID); + out << attr2 << "enable_vgpr_workitem_id" << eq + << AmdSystemVgprWorkitemIdToString((amd_system_vgpr_workitem_id_t)enable_vgpr_workitem_id) + << std::endl; + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH)) { + out << attr2 << "enable_exception_address_watch" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION)) { + out << attr2 << "enable_exception_memory_violation" << eq << "TRUE" + << std::endl; + } + 
uint32_t granulated_lds_size = AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE); + out << attr2 << "granulated_lds_size" << eq + << granulated_lds_size + << std::endl; + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION)) { + out << attr2 << "enable_exception_ieee_754_fp_invalid_operation" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE)) { + out << attr2 << "enable_exception_fp_denormal_source" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO)) { + out << attr2 << "enable_exception_ieee_754_fp_division_by_zero" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW)) { + out << attr2 << "enable_exception_ieee_754_fp_overflow" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW)) { + out << attr2 << "enable_exception_ieee_754_fp_underflow" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT)) { + out << attr2 << "enable_exception_ieee_754_fp_inexact" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO)) { + out << attr2 << "enable_exception_int_division_by_zero" << eq << "TRUE" + << std::endl; + } +} + +void PrintAmdKernelCodeProperties(std::ostream& out, amd_kernel_code_properties32_t kernel_code_properties) +{ + out << " KERNEL_CODE_PROPERTIES (0x" << std::hex << std::setw(8) << std::setfill('0') << kernel_code_properties << "):" << std::endl; + out << std::dec; + + if (AMD_HSA_BITS_GET(kernel_code_properties, 
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { + out << attr2 << "enable_sgpr_private_segment_buffer" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) { + out << attr2 << "enable_sgpr_dispatch_ptr" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + out << attr2 << "enable_sgpr_queue_ptr" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { + out << attr2 << "enable_sgpr_kernarg_segment_ptr" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) { + out << attr2 << "enable_sgpr_dispatch_id" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) { + out << attr2 << "enable_sgpr_flat_scratch_init" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) { + out << attr2 << "enable_sgpr_private_segment_size" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X)) { + out << attr2 << "enable_sgpr_grid_workgroup_count_x" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y)) { + out << attr2 << "enable_sgpr_grid_workgroup_count_y" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z)) { + out << attr2 << "enable_sgpr_grid_workgroup_count_z" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, 
AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS)) { + out << attr2 << "enable_ordered_append_gds" << eq << "TRUE" + << std::endl; + } + uint32_t private_element_size = AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE); + out << attr2 << "private_element_size" << eq + << AmdElementByteSizeToString((amd_element_byte_size_t)private_element_size) + << std::endl; + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_PTR64)) { + out << attr2 << "is_ptr64" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK)) { + out << attr2 << "is_dynamic_callstack" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED)) { + out << attr2 << "is_debug_enabled" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED)) { + out << attr2 << "is_xnack_enabled" << eq << "TRUE" + << std::endl; + } +} + +void PrintAmdControlDirectives(std::ostream& out, const amd_control_directives_t &control_directives) +{ + if (!control_directives.enabled_control_directives) { + return; + } + + out << " CONTROL_DIRECTIVES:" << std::endl; + + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS) { + out << attr2 << "enable_break_exceptions" << eq + << AmdExceptionKindToString(control_directives.enable_break_exceptions).c_str() + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS) { + out << attr2 << "enable_detect_exceptions" << eq + << AmdExceptionKindToString(control_directives.enable_detect_exceptions).c_str() + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE) { + out << attr2 << "max_dynamic_group_size" << eq + << 
control_directives.max_dynamic_group_size + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE) { + out << attr2 << "max_flat_grid_size" << eq + << control_directives.max_flat_grid_size + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE) { + out << attr2 << "max_flat_workgroup_size" << eq + << control_directives.max_flat_workgroup_size + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM) { + out << attr2 << "required_dim" << eq + << (uint32_t)control_directives.required_dim + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE) { + out << attr2 << "required_grid_size" << eq + << "(" + << control_directives.required_grid_size[0] + << ", " + << control_directives.required_grid_size[1] + << ", " + << control_directives.required_grid_size[2] + << ")" + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE) { + out << attr2 << "required_workgroup_size" << eq + << "(" + << control_directives.required_workgroup_size[0] + << ", " + << control_directives.required_workgroup_size[1] + << ", " + << control_directives.required_workgroup_size[2] + << ")" + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS) { + out << attr2 << "require_no_partial_workgroups" << eq << "TRUE" + << std::endl; + } +} + +namespace code_options { + + std::ostream& space(std::ostream& out) + { + if (out.tellp()) { out << " "; } + return out; + } + + std::ostream& operator<<(std::ostream& out, const control_directive& d) + { + out << space << + "-hsa_control_directive:" << d.name << "="; + return out; + } + + const char *BrigExceptionString(BrigExceptions32_t e) + { + switch 
(e) { + case BRIG_EXCEPTIONS_INVALID_OPERATION: return "INVALID_OPERATION"; + case BRIG_EXCEPTIONS_DIVIDE_BY_ZERO: return "DIVIDE_BY_ZERO"; + case BRIG_EXCEPTIONS_OVERFLOW: return "OVERFLOW"; + case BRIG_EXCEPTIONS_INEXACT: return "INEXACT"; + default: + assert(false); return ""; + } + } + + std::ostream& operator<<(std::ostream& out, const exceptions_mask& e) + { + bool first = true; + for (BrigExceptions32_t be = BRIG_EXCEPTIONS_INVALID_OPERATION; be < BRIG_EXCEPTIONS_FIRST_USER_DEFINED; ++be) { + if (e.mask & be) { + if (first) { first = false; } else { out << ","; } + out << BrigExceptionString(be); + } + } + return out; + } + + std::ostream& operator<<(std::ostream& out, const control_directives& cd) + { + const hsa_ext_control_directives_t& d = cd.d; + uint64_t mask = d.control_directives_mask; + if (!mask) { return out; } + + if (mask & BRIG_CONTROL_ENABLEBREAKEXCEPTIONS) { + out << + control_directive("ENABLEBREAKEXCEPTIONS") << + exceptions_mask(d.break_exceptions_mask); + } + if (mask & BRIG_CONTROL_ENABLEDETECTEXCEPTIONS) { + out << + control_directive("ENABLEDETECTEXCEPTIONS") << + exceptions_mask(d.detect_exceptions_mask); + } + if (mask & BRIG_CONTROL_MAXDYNAMICGROUPSIZE) { + out << + control_directive("MAXDYNAMICGROUPSIZE") << + d.max_dynamic_group_size; + } + if (mask & BRIG_CONTROL_MAXFLATGRIDSIZE) { + out << + control_directive("MAXFLATGRIDSIZE") << + d.max_flat_grid_size; + } + if (mask & BRIG_CONTROL_MAXFLATWORKGROUPSIZE) { + out << + control_directive("MAXFLATWORKGROUPSIZE") << + d.max_flat_workgroup_size; + } + if (mask & BRIG_CONTROL_REQUIREDDIM) { + out << + control_directive("REQUIREDDIM") << + d.required_dim; + } + if (mask & BRIG_CONTROL_REQUIREDGRIDSIZE) { + out << + control_directive("REQUIREDGRIDSIZE") << + d.required_grid_size[0] << "," << + d.required_grid_size[1] << "," << + d.required_grid_size[2]; + } + if (mask & BRIG_CONTROL_REQUIREDWORKGROUPSIZE) { + out << + control_directive("REQUIREDWORKGROUPSIZE") << + 
d.required_workgroup_size.x << "," << + d.required_workgroup_size.y << "," << + d.required_workgroup_size.z; + } + return out; + } +} + +const char* hsaerr2str(hsa_status_t status) { + switch ((unsigned) status) { + case HSA_STATUS_SUCCESS: + return + "HSA_STATUS_SUCCESS: The function has been executed successfully."; + case HSA_STATUS_INFO_BREAK: + return + "HSA_STATUS_INFO_BREAK: A traversal over a list of " + "elements has been interrupted by the application before " + "completing."; + case HSA_STATUS_ERROR: + return "HSA_STATUS_ERROR: A generic error has occurred."; + case HSA_STATUS_ERROR_INVALID_ARGUMENT: + return + "HSA_STATUS_ERROR_INVALID_ARGUMENT: One of the actual " + "arguments does not meet a precondition stated in the " + "documentation of the corresponding formal argument."; + case HSA_STATUS_ERROR_INVALID_QUEUE_CREATION: + return + "HSA_STATUS_ERROR_INVALID_QUEUE_CREATION: The requested " + "queue creation is not valid."; + case HSA_STATUS_ERROR_INVALID_ALLOCATION: + return + "HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested " + "allocation is not valid."; + case HSA_STATUS_ERROR_INVALID_AGENT: + return + "HSA_STATUS_ERROR_INVALID_AGENT: The agent is invalid."; + case HSA_STATUS_ERROR_INVALID_REGION: + return + "HSA_STATUS_ERROR_INVALID_REGION: The memory region is invalid."; + case HSA_STATUS_ERROR_INVALID_SIGNAL: + return + "HSA_STATUS_ERROR_INVALID_SIGNAL: The signal is invalid."; + case HSA_STATUS_ERROR_INVALID_QUEUE: + return + "HSA_STATUS_ERROR_INVALID_QUEUE: The queue is invalid."; + case HSA_STATUS_ERROR_OUT_OF_RESOURCES: + return + "HSA_STATUS_ERROR_OUT_OF_RESOURCES: The runtime failed to " + "allocate the necessary resources. 
This error may also " + "occur when the core runtime library needs to spawn " + "threads or create internal OS-specific events."; + case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT: + return + "HSA_STATUS_ERROR_INVALID_PACKET_FORMAT: The AQL packet " + "is malformed."; + case HSA_STATUS_ERROR_RESOURCE_FREE: + return + "HSA_STATUS_ERROR_RESOURCE_FREE: An error has been " + "detected while releasing a resource."; + case HSA_STATUS_ERROR_NOT_INITIALIZED: + return + "HSA_STATUS_ERROR_NOT_INITIALIZED: An API other than " + "hsa_init has been invoked while the reference count of " + "the HSA runtime is zero."; + case HSA_STATUS_ERROR_REFCOUNT_OVERFLOW: + return + "HSA_STATUS_ERROR_REFCOUNT_OVERFLOW: The maximum " + "reference count for the object has been reached."; + case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS: + return + "HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS: The arguments passed to " + "a functions are not compatible."; + case HSA_STATUS_ERROR_INVALID_INDEX: + return "The index is invalid."; + case HSA_STATUS_ERROR_INVALID_ISA: + return "The instruction set architecture is invalid."; + case HSA_STATUS_ERROR_INVALID_CODE_OBJECT: + return "The code object is invalid."; + case HSA_STATUS_ERROR_INVALID_EXECUTABLE: + return "The executable is invalid."; + case HSA_STATUS_ERROR_FROZEN_EXECUTABLE: + return "The executable is frozen."; + case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME: + return "There is no symbol with the given name."; + case HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED: + return "The variable is already defined."; + case HSA_STATUS_ERROR_VARIABLE_UNDEFINED: + return "The variable is undefined."; + case HSA_EXT_STATUS_ERROR_INVALID_PROGRAM: + return + "HSA_EXT_STATUS_ERROR_INVALID_PROGRAM: Invalid program"; + case HSA_EXT_STATUS_ERROR_INVALID_MODULE: + return "HSA_EXT_STATUS_ERROR_INVALID_MODULE: Invalid module"; + case HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE: + return + "HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE: Incompatible module"; + case 
HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED: + return + "HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED: Module already " + "included"; + case HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH: + return + "HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH: Symbol mismatch"; + case HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED: + return + "HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED: Finalization failed"; + case HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH: + return + "HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH: Directive mismatch"; + default: + return + "Unknown HSA status"; + } +} + +bool ReadFileIntoBuffer(const std::string& filename, std::vector& buffer) +{ + std::ifstream file(filename, std::ios::binary); + if (!file) { return false; } + file.seekg(0, std::ios::end); + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + + buffer.resize((size_t) size); + if (!file.read(buffer.data(), size)) { return false; } + return true; +} + +#ifndef _WIN32 +#define _tempnam tempnam +#define _close close +#define _getpid getpid +#define _open open +#endif // _WIN32 + +int OpenTempFile(const char* prefix) +{ + unsigned c = 0; + std::string tname = prefix; + tname += "_"; + tname += std::to_string(_getpid()); + tname += "_"; + while (c++ < 20) { // Loop because several threads can generate same filename. 
+#ifdef _WIN32 + char dir[MAX_PATH+1]; + if (!GetTempPath(sizeof(dir), dir)) { return -1; } +#else // _WIN32 + char *dir = NULL; +#endif // _WIN32 + char *name = _tempnam(dir, tname.c_str()); + if (!name) { return -1; } +#ifdef _WIN32 + HANDLE h = CreateFile( + name, + GENERIC_READ | GENERIC_WRITE, + 0, // No sharing + NULL, + CREATE_NEW, + FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, + NULL); + free(name); + if (h == INVALID_HANDLE_VALUE) { continue; } + return _open_osfhandle((intptr_t)h, 0); +#else // _WIN32 + int d = _open(name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (d < 0) { free(name); continue; } + if (unlink(name) < 0) { free(name); _close(d); return -1; } + free(name); + return d; +#endif // _WIN32 + } + return -1; +} + +void CloseTempFile(int fd) +{ + _close(fd); +} + +const char * CommentTopCallBack(void *ctx, int type) { + static const char* amd_kernel_code_t_begin = "amd_kernel_code_t begin"; + static const char* amd_kernel_code_t_end = "amd_kernel_code_t end"; + static const char* isa_begin = "isa begin"; + switch(type) { + case COMMENT_AMD_KERNEL_CODE_T_BEGIN: + return amd_kernel_code_t_begin; + case COMMENT_AMD_KERNEL_CODE_T_END: + return amd_kernel_code_t_end; + case COMMENT_KERNEL_ISA_BEGIN: + return isa_begin; + default: + assert(false); + return ""; + } +} +const char * CommentRightCallBack(void *ctx, int type) { + return nullptr; +} + +uint32_t ParseInstructionOffset(const std::string& instruction) { + // instruction format: opcode op1, op2 ... 
// offset: binopcode + std::string::size_type n = instruction.find("//"); + assert(n != std::string::npos); + std::string comment = instruction.substr(n); + n = comment.find(':'); + assert(n != std::string::npos); + comment.erase(n); + assert(comment.size() > 3); + comment.erase(0, 3); + return strtoul(comment.c_str(), nullptr, 16); +} + +bool IsNotSpace(char c) { + return !isspace(static_cast(c)); +} + +void ltrim(std::string &str) { + str.erase(str.begin(), std::find_if(str.begin(), str.end(), IsNotSpace)); +} + +} +} diff --git a/runtime/hsa-runtime/libamdhsacode/amd_hsa_code_util.hpp b/runtime/hsa-runtime/libamdhsacode/amd_hsa_code_util.hpp new file mode 100644 index 0000000000..30d07e2df9 --- /dev/null +++ b/runtime/hsa-runtime/libamdhsacode/amd_hsa_code_util.hpp @@ -0,0 +1,193 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Utility declarations shared by the AMD HSA code-object library: string
+// conversion helpers, pretty-printers, finalizer option formatting, temporary
+// file helpers, disassembler comment callbacks and aligned-allocation helpers.
+
+#ifndef AMD_HSA_CODE_UTIL_HPP_
+#define AMD_HSA_CODE_UTIL_HPP_
+
+// NOTE(review): the targets of the system #include lines below (and the
+// static_cast / std::vector template arguments further down) are missing in
+// this copy; they look stripped by extraction -- restore from the original.
+#include
+#include
+#include
+#include
+#ifdef _WIN32
+#include
+#else // _WIN32
+#include
+#endif // _WIN32
+#include "amd_hsa_kernel_code.h"
+#include "amd_hsa_elf.h"
+#include "hsa.h"
+#include "hsa_ext_finalize.h"
+
+// Casts `e` with static_cast (target type not visible here, see note above).
+#define hsa_error(e) static_cast(e)
+
+// Assertion that does not depend on NDEBUG: when `e` evaluates to false it
+// prints "<file>:<line>: Assertion `e' failed." to std::cerr and calls
+// std::abort().
+// NOTE(review): the expansion is a bare `if` block rather than
+// do { ... } while (0), so an `else` following an un-braced use can bind to
+// the macro's `if`. The last macro line also ends with a continuation
+// backslash, so the line right after the macro must stay empty.
+#define release_assert(e) \
+  if (!(e)) { \
+    std::cerr << __FILE__ << ":"; \
+    std::cerr << __LINE__ << ":"; \
+    std::cerr << " Assertion `" << #e << "' failed." << std::endl; \
+    std::abort(); \
+  } \
+
+namespace amd {
+namespace hsa {
+
+// String conversion helpers for HSA / AMD enumerations and bit-field values.
+// Only the declarations are visible in this header.
+std::string HsaSymbolKindToString(hsa_symbol_kind_t kind);
+std::string HsaSymbolLinkageToString(hsa_symbol_linkage_t linkage);
+std::string HsaVariableAllocationToString(hsa_variable_allocation_t allocation);
+std::string HsaVariableSegmentToString(hsa_variable_segment_t segment);
+std::string HsaProfileToString(hsa_profile_t profile);
+std::string HsaMachineModelToString(hsa_machine_model_t model);
+std::string HsaFloatRoundingModeToString(hsa_default_float_rounding_mode_t mode);
+std::string AmdMachineKindToString(amd_machine_kind16_t machine);
+std::string AmdFloatRoundModeToString(amd_float_round_mode_t round_mode);
+std::string AmdFloatDenormModeToString(amd_float_denorm_mode_t denorm_mode);
+std::string AmdSystemVgprWorkitemIdToString(amd_system_vgpr_workitem_id_t system_vgpr_workitem_id);
+std::string AmdElementByteSizeToString(amd_element_byte_size_t element_byte_size);
+std::string AmdExceptionKindToString(amd_exception_kind16_t exceptions);
+std::string AmdPowerTwoToString(amd_powertwo8_t p);
+amdgpu_hsa_elf_segment_t AmdHsaElfSectionSegment(amdgpu_hsa_elf_section_t sec);
+bool IsAmdHsaElfSectionROData(amdgpu_hsa_elf_section_t sec);
+std::string AmdHsaElfSegmentToString(amdgpu_hsa_elf_segment_t seg);
+std::string AmdPTLoadToString(uint64_t type);
+
+// Pretty-printers that write a textual dump of the given value to `out`.
+// PrintAmdKernelCodeProperties prints one "name = TRUE" line per set flag;
+// PrintAmdControlDirectives prints nothing when no directive is enabled.
+void PrintAmdKernelCode(std::ostream& out, const amd_kernel_code_t *akc);
+void PrintAmdComputePgmRsrcOne(std::ostream& out, amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1);
+void PrintAmdComputePgmRsrcTwo(std::ostream& out, amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2);
+void PrintAmdKernelCodeProperties(std::ostream& out, amd_kernel_code_properties32_t kernel_code_properties);
+void PrintAmdControlDirectives(std::ostream& out, const amd_control_directives_t &control_directives);
+
+// Stream helpers that format control directives as command-line style
+// options ("-hsa_control_directive:<NAME>=<value>").
+namespace code_options {
+  // Space between options (not at the beginning).
+  // Implemented as a manipulator: emits " " unless out.tellp() is 0.
+  std::ostream& space(std::ostream& out);
+
+  // Control directive option without value.
+  // Streaming it emits the separator, "-hsa_control_directive:", the name
+  // and a trailing "="; the value is streamed by the caller afterwards.
+  struct control_directive {
+    const char *name;  // not owned; must outlive this object
+    control_directive(const char* name_) : name(name_) { }
+  };
+  std::ostream& operator<<(std::ostream& out, const control_directive& d);
+
+  // Exceptions mask string.
+  // Streaming it emits the names of the exceptions set in `mask`, separated
+  // by commas.
+  struct exceptions_mask {
+    uint16_t mask;
+    exceptions_mask(uint16_t mask_) : mask(mask_) { }
+  };
+  std::ostream& operator<<(std::ostream& out, const exceptions_mask& e);
+
+  // Control directives options.
+  // Holds a reference only: the referenced hsa_ext_control_directives_t must
+  // stay alive for as long as this wrapper is streamed.
+  struct control_directives {
+    const hsa_ext_control_directives_t& d;
+    control_directives(const hsa_ext_control_directives_t& d_) : d(d_) { }
+  };
+  std::ostream& operator<<(std::ostream& out, const control_directives& cd);
+}
+
+// Returns a static, human-readable description of `status`
+// ("Unknown HSA status" for values it does not recognize).
+const char* hsaerr2str(hsa_status_t status);
+// Reads the whole file `filename` (opened in binary mode) into `buffer`,
+// resizing it to the file size. Returns false if the file cannot be opened
+// or read.
+bool ReadFileIntoBuffer(const std::string& filename, std::vector& buffer);
+
+// Create new empty temporary file that will be deleted when closed.
+// The name is built from `prefix` and the process id. Returns a file
+// descriptor, or -1 on failure (creation is retried up to 20 times).
+int OpenTempFile(const char* prefix);
+// Closes a descriptor obtained from OpenTempFile.
+void CloseTempFile(int fd);
+
+// Helper comment types for isa disassembler
+enum DumpIsaCommentType {
+  COMMENT_AMD_KERNEL_CODE_T_BEGIN = 1,
+  COMMENT_AMD_KERNEL_CODE_T_END,
+  COMMENT_KERNEL_ISA_BEGIN,
+};
+
+// Callbacks to create helper comments for isa disassembler
+// `type` is a DumpIsaCommentType value. CommentTopCallBack returns a static
+// string for it; CommentRightCallBack always returns nullptr.
+const char * CommentTopCallBack(void *ctx, int type);
+const char * CommentRightCallBack(void *ctx, int type);
+
+// Parse disassembler instruction line to find offset
+// Expects a trailing comment of the form "// <hex offset>: <encoding>" and
+// returns the hexadecimal offset found before the ':'.
+uint32_t ParseInstructionOffset(const std::string& instruction);
+
+// Trim whitespaces from start of string
+void ltrim(std::string &str);
+
+
+// Helper function that allocates an aligned memory.
+// Returns NULL on failure. On non-Windows builds the alignment is raised to
+// at least sizeof(void*) before calling posix_memalign. Release the result
+// with alignedFree().
+inline void*
+alignedMalloc(size_t size, size_t alignment)
+{
+#if defined(_WIN32)
+  return ::_aligned_malloc(size, alignment);
+#else
+  void * ptr = NULL;
+  alignment = (std::max)(alignment, sizeof(void*));
+  if (0 == ::posix_memalign(&ptr, alignment, size)) {
+    return ptr;
+  }
+  return NULL;
+#endif
+}
+
+// Helper function that frees an aligned memory.
+// Counterpart of alignedMalloc(): _aligned_free on Windows, free elsewhere.
+inline void
+alignedFree(void *ptr)
+{
+#if defined(_WIN32)
+  ::_aligned_free(ptr);
+#else
+  free(ptr);
+#endif
+}
+
+// Rounds `num` up to the next multiple of `align`. `align` must be a
+// non-zero power of two (checked with assert only).
+inline uint64_t alignUp(uint64_t num, uint64_t align)
+{
+  assert(align);
+  assert((align & (align - 1)) == 0);
+  return (num + align - 1) & ~(align - 1);
+}
+
+// 32-bit overload of alignUp; same contract as the 64-bit version.
+inline uint32_t alignUp(uint32_t num, uint32_t align)
+{
+  assert(align);
+  assert((align & (align - 1)) == 0);
+  return (num + align - 1) & ~(align - 1);
+}
+
+}
+}
+
+#endif // AMD_HSA_CODE_UTIL_HPP_
diff --git a/runtime/hsa-runtime/libamdhsacode/amd_hsa_locks.cpp b/runtime/hsa-runtime/libamdhsacode/amd_hsa_locks.cpp
new file mode 100644
index 0000000000..7547697831
--- /dev/null
+++ b/runtime/hsa-runtime/libamdhsacode/amd_hsa_locks.cpp
@@ -0,0 +1,94 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "amd_hsa_locks.hpp" + +namespace amd { +namespace hsa { +namespace common { + +void ReaderWriterLock::ReaderLock() +{ + internal_lock_.lock(); + while (0 < writers_count_) { + readers_condition_.wait(internal_lock_); + } + readers_count_ += 1; + internal_lock_.unlock(); +} + +void ReaderWriterLock::ReaderUnlock() +{ + internal_lock_.lock(); + readers_count_ -= 1; + if (0 == readers_count_ && 0 < writers_waiting_) { + writers_condition_.notify_one(); + } + internal_lock_.unlock(); +} + +void ReaderWriterLock::WriterLock() +{ + internal_lock_.lock(); + writers_waiting_ += 1; + while (0 < readers_count_ || 0 < writers_count_) { + writers_condition_.wait(internal_lock_); + } + writers_count_ += 1; + writers_waiting_ -= 1; + internal_lock_.unlock(); +} + +void ReaderWriterLock::WriterUnlock() +{ + internal_lock_.lock(); + writers_count_ -= 1; + if (0 < writers_waiting_) { + writers_condition_.notify_one(); + } + readers_condition_.notify_all(); + internal_lock_.unlock(); +} + +} // namespace common +} // namespace hsa +} // namespace amd diff --git a/runtime/hsa-runtime/libamdhsacode/amd_hsa_locks.hpp b/runtime/hsa-runtime/libamdhsacode/amd_hsa_locks.hpp new file mode 
100644 index 0000000000..1bfa1ad5d2 --- /dev/null +++ b/runtime/hsa-runtime/libamdhsacode/amd_hsa_locks.hpp @@ -0,0 +1,127 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Reader/writer lock and scoped guards for it.
+
+#ifndef AMD_HSA_LOCKS_HPP
+#define AMD_HSA_LOCKS_HPP
+
+// NOTE(review): the #include targets and the template parameter lists below
+// are missing in this copy (they look stripped by extraction). The guards use
+// a type named LockType, presumably their template parameter -- restore from
+// the original file.
+#include
+#include
+#include
+
+namespace amd {
+namespace hsa {
+namespace common {
+
+// Scoped shared (read) lock: calls lock.ReaderLock() on construction and
+// lock.ReaderUnlock() on destruction. Stores a reference, so `lock` must
+// outlive the guard.
+template
+class ReaderLockGuard final {
+public:
+  explicit ReaderLockGuard(LockType &lock):
+    lock_(lock)
+  {
+    lock_.ReaderLock();
+  }
+
+  ~ReaderLockGuard()
+  {
+    lock_.ReaderUnlock();
+  }
+
+private:
+  // Copy operations are declared private with no definition in this header,
+  // which makes the guard non-copyable.
+  ReaderLockGuard(const ReaderLockGuard&);
+  ReaderLockGuard& operator=(const ReaderLockGuard&);
+
+  LockType &lock_;
+};
+
+// Scoped exclusive (write) lock: calls lock.WriterLock() on construction and
+// lock.WriterUnlock() on destruction. Stores a reference, so `lock` must
+// outlive the guard.
+template
+class WriterLockGuard final {
+public:
+  explicit WriterLockGuard(LockType &lock):
+    lock_(lock)
+  {
+    lock_.WriterLock();
+  }
+
+  ~WriterLockGuard()
+  {
+    lock_.WriterUnlock();
+  }
+
+private:
+  // Copy operations are declared private with no definition in this header,
+  // which makes the guard non-copyable.
+  WriterLockGuard(const WriterLockGuard&);
+  WriterLockGuard& operator=(const WriterLockGuard&);
+
+  LockType &lock_;
+};
+
+// Lock that admits either any number of concurrent readers or one writer.
+// Built from one mutex, three counters and two condition variables; the
+// member functions are declared here and defined out of line.
+class ReaderWriterLock final {
+public:
+  // Starts unlocked: no readers, no writer, nobody waiting.
+  ReaderWriterLock():
+    readers_count_(0), writers_count_(0), writers_waiting_(0) {}
+
+  ~ReaderWriterLock() {}
+
+  // Acquire shared (read) access.
+  void ReaderLock();
+
+  // Release shared access taken with ReaderLock().
+  void ReaderUnlock();
+
+  // Acquire exclusive (write) access.
+  void WriterLock();
+
+  // Release exclusive access taken with WriterLock().
+  void WriterUnlock();
+
+private:
+  // Copy operations are declared private with no definition in this header,
+  // which makes the lock non-copyable.
+  ReaderWriterLock(const ReaderWriterLock&);
+  ReaderWriterLock& operator=(const ReaderWriterLock&);
+
+  size_t readers_count_;    // initialised to 0; presumably active readers
+  size_t writers_count_;    // initialised to 0; presumably active writers
+  size_t writers_waiting_;  // initialised to 0; presumably writers blocked in WriterLock()
+  std::mutex internal_lock_;  // NOTE(review): presumably guards the three counters above -- confirm in the .cpp
+  std::condition_variable_any readers_condition_;  // presumably what blocked readers wait on
+  std::condition_variable_any writers_condition_;  // presumably what blocked writers wait on
+};
+
+} // namespace common
+} // namespace hsa
+} // namespace amd
+
+#endif // AMD_HSA_LOCKS_HPP
diff --git a/runtime/hsa-runtime/libamdhsacode/amd_options.cpp b/runtime/hsa-runtime/libamdhsacode/amd_options.cpp
new file mode 100644
index
0000000000..2c72f2d018 --- /dev/null +++ b/runtime/hsa-runtime/libamdhsacode/amd_options.cpp @@ -0,0 +1,340 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "amd_options.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace amd { +namespace options { + +//===----------------------------------------------------------------------===// +// StringFactory. // +//===----------------------------------------------------------------------===// + +std::string StringFactory::Flatten(const char **cstrs, + const uint32_t &cstrs_count, + const char &spacer) { + if (NULL == cstrs || 0 == cstrs_count) { + return std::string(); + } + + std::string flattened; + for (uint32_t i = 0; i < cstrs_count; ++i) { + if (NULL == cstrs[i]) { + return std::string(); + } + flattened += cstrs[i]; + if (i != (cstrs_count - 1)) { + flattened += spacer; + } + } + return flattened; +} + +std::list StringFactory::Tokenize(const char *cstr, + const char &delim) { + if (NULL == cstr) { + return std::list(); + } + + const std::string str = cstr; + size_t start = 0; + size_t end = 0; + + std::list tokens; + while ((end = str.find(delim, start)) != std::string::npos) { + if (start != end) { + tokens.push_back(str.substr(start, end - start)); + } + start = end + 1; + } + if (str.size() > start) { + tokens.push_back(str.substr(start)); + } + return tokens; +} + +std::string StringFactory::ToLower(const std::string& str) { + std::string lower(str.length(), ' '); + std::transform(str.begin(), str.end(), lower.begin(), ::tolower); + return lower; +} + +std::string StringFactory::ToUpper(const std::string& str) { + std::string upper(str.length(), ' '); + std::transform(str.begin(), str.end(), upper.begin(), ::toupper); + 
return upper; +} + +//===----------------------------------------------------------------------===// +// HelpPrinter, HelpStreambuf. // +//===----------------------------------------------------------------------===// + +HelpStreambuf::HelpStreambuf(std::ostream& stream) + : basicStream_(&stream), + basicBuf_(stream.rdbuf()), + wrapWidth_(0), + indentSize_(0), + atLineStart_(true), + lineWidth_(0) +{ + basicStream_->rdbuf(this); +} + +HelpStreambuf::int_type HelpStreambuf::overflow(HelpStreambuf::int_type ch) { + if (atLineStart_ && ch != '\n') { + std::string indent(indentSize_, ' '); + basicBuf_->sputn(indent.data(), indent.size()); + lineWidth_ = indentSize_; + atLineStart_ = false; + } else if (ch == '\n') { + atLineStart_ = true; + lineWidth_ = 0; + } + + if (wrapWidth_ > 0 && lineWidth_ == wrapWidth_) { + basicBuf_->sputc('\n'); + std::string indent(indentSize_, ' '); + basicBuf_->sputn(indent.data(), indent.size()); + lineWidth_ = indentSize_; + atLineStart_ = false; + } + + lineWidth_++; + return basicBuf_->sputc(ch); + } + +HelpPrinter& HelpPrinter::PrintUsage(const std::string& usage) { + sbuf_.IndentSize(0); + sbuf_.WrapWidth(0); + Stream() << usage; + if (usage.length() < USAGE_WIDTH) { + Stream() << std::string(USAGE_WIDTH - usage.length(), ' '); + } + Stream() << std::string(PADDING_WIDTH, ' '); + return *this; +} + +HelpPrinter& HelpPrinter::PrintDescription(const std::string& description) { + sbuf_.WrapWidth(USAGE_WIDTH + PADDING_WIDTH + DESCRIPTION_WIDTH); + sbuf_.IndentSize(USAGE_WIDTH + PADDING_WIDTH); + Stream() << description << std::endl; + sbuf_.IndentSize(0); + sbuf_.WrapWidth(0); + return *this; +} + +//===----------------------------------------------------------------------===// +// ChoiceOptioin. 
// +//===----------------------------------------------------------------------===// +ChoiceOption::ChoiceOption(const std::string& name, + const std::vector& choices, + const std::string& help, + std::ostream& error) + : OptionBase(name, help, error) { + for (const auto& choice: choices) { + choices_.insert(choice); + } + } + +bool ChoiceOption::ProcessTokens(std::list &tokens) { + assert(0 == name_.compare(tokens.front()) && "option name is mismatched"); + if (2 != tokens.size()) { + error() << "error: invalid option: \'" << name_ << '\'' << std::endl; + return false; + } + + tokens.pop_front(); + + if (0 == choices_.count(tokens.front())) { + error() << "error: invalid option: \'" << name_ << '\'' << std::endl; + return false; + } + + is_set_ = true; + value_ = tokens.front(); + tokens.pop_front(); + return true; +} + +void ChoiceOption::PrintHelp(HelpPrinter& printer) const { + std::string usage = "-" + name_ + "=["; + bool first = true; + for (const auto& choice: choices_) { + if (!first) { + usage += '|'; + } else { + first = false; + } + usage += choice; + } + usage += "]"; + printer.PrintUsage(usage).PrintDescription(help_); +} + +//===----------------------------------------------------------------------===// +// OptionParser. 
// +//===----------------------------------------------------------------------===// +std::vector::iterator +OptionParser::FindOption(const std::string& name) { + std::vector::iterator it = options_.begin(); + std::vector::iterator end = options_.end(); + for (; it != end; ++it) { + if ((*it)->name() == name) { + return it; + } + } + return end; +} + +bool OptionParser::AddOption(OptionBase *option) { + if (NULL == option || !option->IsValid()) { + return false; + } + if (FindOption(option->name()) != options_.end()) { + return false; + } + options_.push_back(option); + return true; +} + +const std::string& OptionParser::Unknown() const { + assert(collectUnknown_); + return unknownOptions_; +} + +bool OptionParser::ParseOptions(const char *options) { + std::list tokens_l1 = StringFactory::Tokenize(options, ' '); + if (0 == tokens_l1.size()) { + return true; + } + + std::list::iterator tokens_l1i = tokens_l1.begin(); + while (tokens_l1i != tokens_l1.end()) { + if ('-' == tokens_l1i->at(0)) { + std::list::iterator option_begin = tokens_l1i; + std::list tokens_l2; + do { + tokens_l2.push_back(*tokens_l1i); + tokens_l1i++; + } while (tokens_l1i != tokens_l1.end() && '-' != tokens_l1i->at(0)); + std::list::iterator option_end = tokens_l1i; + tokens_l2.front().erase(0, 1); + + if (1 == tokens_l2.size()) { + tokens_l2 = StringFactory::Tokenize(tokens_l2.front().c_str(), '='); + if (2 < tokens_l2.size()) { + if (collectUnknown_) { + unknownOptions_ += *tokens_l1i + " "; + continue; + } else { + error() << "error: invalid option format: \'" + << tokens_l2.front() << '\'' << std::endl; + Reset(); + return false; + } + } + } + + auto find_status = FindOption(tokens_l2.front()); + if (find_status == options_.end()) { + if (collectUnknown_) { + for (; option_begin != option_end; ++option_begin) { + unknownOptions_ += *option_begin + " "; + } + continue; + } else { + error() << "error: unknown option: \'" + << tokens_l2.front() << '\'' << std::endl; + Reset(); + return false; + 
} + } + + if (!(*find_status)->ProcessTokens(tokens_l2)) { + Reset(); + return false; + } + assert(0 == tokens_l2.size()); + } else { + if (collectUnknown_) { + unknownOptions_ += *tokens_l1i + " "; + } else { + error() << "error: unknown option: \'" + << *tokens_l1i << '\'' << std::endl; + Reset(); + return false; + } + } + } + + return true; +} + +void OptionParser::PrintHelp(std::ostream& out, const std::string& addition) const { + HelpPrinter printer(out); + for (const auto& option: options_) { + option->PrintHelp(printer); + } + out << addition << std::endl; +} + +void OptionParser::Reset() { + unknownOptions_.clear(); + for (auto &option : options_) { + option->Reset(); + } +} + +} // namespace options +} // namespace amd diff --git a/runtime/hsa-runtime/libamdhsacode/amd_options.hpp b/runtime/hsa-runtime/libamdhsacode/amd_options.hpp new file mode 100644 index 0000000000..25d812250b --- /dev/null +++ b/runtime/hsa-runtime/libamdhsacode/amd_options.hpp @@ -0,0 +1,430 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_OPTIONS_HPP
+#define AMD_OPTIONS_HPP
+
+// NOTE(review): the header names of the #include lines below, and the template
+// argument list of std::list in Tokenize, appear to have been stripped from
+// this copy of the patch.
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace amd {
+namespace options {
+
+//===----------------------------------------------------------------------===//
+// StringFactory.                                                             //
+//===----------------------------------------------------------------------===//
+
+/// @brief Collection of static string helpers.
+class StringFactory final {
+public:
+  /// @brief Joins @p cstrs_count C strings, placing @p spacer between
+  /// consecutive entries.
+  static std::string Flatten(const char **cstrs,
+                             const uint32_t &cstrs_count,
+                             const char &spacer = '\0');
+
+  /// @brief Splits @p cstr at each @p delim character.
+  static std::list Tokenize(const char *cstr, const char &delim);
+
+  /// @brief Case-converted copies of @p str.
+  static std::string ToLower(const std::string& str);
+  static std::string ToUpper(const std::string& str);
+};
+
+//===----------------------------------------------------------------------===//
+// HelpPrinter, HelpStreambuf.
// +//===----------------------------------------------------------------------===// + +class HelpStreambuf : public std::streambuf { +public: + explicit HelpStreambuf(std::ostream& stream); + + virtual ~HelpStreambuf() { + basicStream_->rdbuf(basicBuf_); + } + + void IndentSize(unsigned indent) { + assert(wrapWidth_ == 0 || indentSize_ < wrapWidth_); + indentSize_ = indent; + } + + void WrapWidth(unsigned wrap) { + assert(wrapWidth_ == 0 || indentSize_ < wrapWidth_); + wrapWidth_ = wrap; + } + +protected: + virtual int_type overflow(int_type ch) override; + +private: + std::ostream* basicStream_; + std::streambuf* basicBuf_; + + unsigned wrapWidth_; + unsigned indentSize_; + + bool atLineStart_; + unsigned lineWidth_; +}; + + +class HelpPrinter { +private: + static const unsigned USAGE_WIDTH = 30; + static const unsigned PADDING_WIDTH = 2; + static const unsigned DESCRIPTION_WIDTH = 50; + +public: + HelpPrinter& PrintUsage(const std::string& usage); + HelpPrinter& PrintDescription(const std::string& description); + + std::ostream& Stream() { return *out_; } + +private: + explicit HelpPrinter(std::ostream& out = std::cout) : out_(&out), sbuf_(*out_) {} + + /// @brief Not copy-constructible. + HelpPrinter(const HelpPrinter&); + /// @brief Not copy-assignable. + HelpPrinter& operator =(const HelpPrinter&); + + friend class OptionParser; + + std::ostream *out_; + HelpStreambuf sbuf_; +}; + +//===----------------------------------------------------------------------===// +// OptionBase. 
//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): template parameter/argument lists in this section (after
+// `template`, `Option`, `ValueOption`, `std::list`, `std::vector`,
+// `std::unordered_set`) appear to have been stripped from this copy of the
+// patch; they are reproduced as found.
+
+/// @brief Common base of all command line options: holds the name, the help
+/// text, the "was set" flag and the error stream. Token processing and reset
+/// are private and reachable only through OptionParser (friend).
+class OptionBase {
+public:
+  virtual ~OptionBase() {}
+
+  const std::string& name() const {
+    return name_;
+  }
+  const bool& is_set() const {
+    return is_set_;
+  }
+
+  /// @brief An option is valid when its name is non-empty.
+  bool IsValid() const {
+    return 0 < name_.size();
+  }
+
+protected:
+  explicit OptionBase(const std::string& name,
+                      const std::string& help = "",
+                      std::ostream &error = std::cerr)
+    : name_(name),
+      help_(help),
+      is_set_(false),
+      error_(&error) {}
+
+  virtual void PrintHelp(HelpPrinter& printer) const = 0;
+
+  const std::string name_;
+  const std::string help_;
+  bool is_set_;
+
+  std::ostream &error() const { return *error_; }
+
+private:
+  /// @brief Not copy-constructible.
+  OptionBase(const OptionBase &ob);
+  /// @brief Not copy-assignable.
+  OptionBase& operator=(const OptionBase &ob);
+
+  void Reset() {
+    is_set_ = false;
+  }
+
+  /// @brief Consumes the tokens of one occurrence of this option; the first
+  /// token is the option name.
+  virtual bool ProcessTokens(std::list &tokens) = 0;
+
+  friend class OptionParser;
+
+  mutable std::ostream *error_;
+};
+
+
+//===----------------------------------------------------------------------===//
+// Option.                                                                    //
+//===----------------------------------------------------------------------===//
+
+/// @brief Option followed by one or more values ("-name v1 v2 ...").
+template
+class Option final: public OptionBase {
+public:
+  explicit Option(const std::string& name,
+                  const std::string& help = "",
+                  std::ostream& error = std::cerr):
+    OptionBase(name, help, error) {}
+
+  ~Option() {}
+
+  const std::list& values() const {
+    return values_;
+  }
+
+protected:
+  virtual void PrintHelp(HelpPrinter& printer) const override;
+
+private:
+  /// @brief Not copy-constructible.
+  Option(const Option &o);
+  /// @brief Not copy-assignable.
+  Option& operator=(const Option &o);
+
+  bool ProcessTokens(std::list &tokens);
+
+  std::list values_;
+};
+
+/// @brief Consumes {name, value...}; every value is parsed as T with
+/// operator>> and appended to values_.
+/// @returns false (after writing to error()) if no value follows the name or a
+/// value cannot be parsed as T.
+template
+bool Option::ProcessTokens(std::list &tokens) {
+  assert(0 == name_.compare(tokens.front()) && "option name is mismatched");
+  if (2 > tokens.size()) {
+    error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+    return false;
+  }
+
+  is_set_ = true;
+  tokens.pop_front();
+
+  while (!tokens.empty()) {
+    std::istringstream token_stream(tokens.front());
+    if (!token_stream.good()) {
+      error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+      return false;
+    }
+
+    T value;
+    // Check the extraction itself: a freshly constructed stream is always
+    // good(), so only the result of operator>> tells whether parsing worked.
+    if (!(token_stream >> value)) {
+      error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+      return false;
+    }
+
+    values_.push_back(value);
+    tokens.pop_front();
+  }
+  return true;
+}
+
+template
+void Option::PrintHelp(HelpPrinter& printer) const {
+  printer.PrintUsage("-" + name_ + " [" + StringFactory::ToUpper(name_) + "s]")
+         .PrintDescription(help_);
+}
+
+//===----------------------------------------------------------------------===//
+// ValueOption.                                                               //
+//===----------------------------------------------------------------------===//
+
+/// @brief Option carrying exactly one value ("-name=VAL").
+template
+class ValueOption final: public OptionBase {
+public:
+  explicit ValueOption(const std::string& name,
+                       const std::string& help = "",
+                       std::ostream& error = std::cerr):
+    OptionBase(name, help, error) {}
+
+  ~ValueOption() {}
+
+  const T& value() const {
+    return value_;
+  }
+
+protected:
+  void PrintHelp(HelpPrinter& printer) const override;
+
+private:
+  /// @brief Not copy-constructible.
+  ValueOption(const ValueOption &o);
+  /// @brief Not copy-assignable.
+  ValueOption& operator=(const ValueOption &o);
+
+  bool ProcessTokens(std::list &tokens);
+
+  T value_;
+};
+
+/// @brief Consumes {name, value}; the value is parsed as T with operator>>.
+/// @returns false (after writing to error()) if the token count is not two or
+/// the value cannot be parsed as T.
+template
+bool ValueOption::ProcessTokens(std::list &tokens) {
+  assert(0 == name_.compare(tokens.front()) && "option name is mismatched");
+  if (2 != tokens.size()) {
+    error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+    return false;
+  }
+
+  is_set_ = true;
+  tokens.pop_front();
+
+  std::istringstream token_stream(tokens.front());
+  if (!token_stream.good()) {
+    error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+    return false;
+  }
+  // Check the extraction itself; see Option::ProcessTokens.
+  if (!(token_stream >> value_)) {
+    error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+    return false;
+  }
+  tokens.pop_front();
+  return true;
+}
+
+template
+void ValueOption::PrintHelp(HelpPrinter& printer) const {
+  printer.PrintUsage("-" + name_ + "=[VAL]")
+         .PrintDescription(help_);
+}
+
+//===----------------------------------------------------------------------===//
+// ChoiceOption.                                                              //
+//===----------------------------------------------------------------------===//
+/// @brief Option whose single value must be one of a fixed set of strings
+/// ("-name=[a|b|...]").
+class ChoiceOption final: public OptionBase {
+public:
+  ChoiceOption(const std::string& name,
+               const std::vector& choices,
+               const std::string& help = "",
+               std::ostream& error = std::cerr);
+
+  ~ChoiceOption() {}
+
+  const std::string& value() const {
+    return value_;
+  }
+
+protected:
+  void PrintHelp(HelpPrinter& printer) const override;
+
+private:
+  /// @brief Not copy-constructible.
+  ChoiceOption(const ChoiceOption&);
+  /// @brief Not copy-assignable.
+  ChoiceOption& operator =(const ChoiceOption&);
+
+  bool ProcessTokens(std::list &tokens);
+
+  std::unordered_set choices_;
+  std::string value_;
+};
+
+//===----------------------------------------------------------------------===//
+// NoArgOption.
//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): template argument lists (std::list, std::vector) in this
+// section appear to have been stripped from this copy of the patch.
+
+/// @brief Flag option that takes no value ("-name").
+class NoArgOption final: public OptionBase {
+public:
+  explicit NoArgOption(const std::string& name,
+                       const std::string& help = "",
+                       std::ostream& error = std::cerr):
+    OptionBase(name, help, error) {}
+
+  ~NoArgOption() {}
+
+protected:
+  void PrintHelp(HelpPrinter& printer) const override {
+    printer.PrintUsage("-" + name_).PrintDescription(help_);
+  }
+
+private:
+  /// @brief Not copy-constructible.
+  NoArgOption(const NoArgOption &o);
+  /// @brief Not copy-assignable.
+  NoArgOption& operator=(const NoArgOption &o);
+
+  /// @brief Accepts exactly one token (the option name) and marks the option
+  /// as set; any other token count is reported on error() and rejected.
+  bool ProcessTokens(std::list &tokens) {
+    assert(0 == name_.compare(tokens.front()) && "option name is mismatched");
+    if (1 != tokens.size()) {
+      error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+      return false;
+    }
+
+    is_set_ = true;
+    tokens.pop_front();
+    return true;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// OptionParser.                                                              //
+//===----------------------------------------------------------------------===//
+
+/// @brief Parses an option string against a set of registered options.
+/// Registered options are held by pointer in options_.
+class OptionParser final {
+public:
+  explicit OptionParser(bool collectUnknown = false, std::ostream& error = std::cerr)
+    : collectUnknown_(collectUnknown),
+      error_(&error) {}
+
+  ~OptionParser() {}
+
+  bool AddOption(OptionBase *option);
+
+  bool ParseOptions(const char *options);
+
+  const std::string& Unknown() const;
+  void CollectUnknown(bool b) { collectUnknown_ = b; }
+
+  void PrintHelp(std::ostream& out, const std::string& addition = "") const;
+
+  void Reset();
+
+private:
+  /// @brief Not copy-constructible.
+  OptionParser(const OptionParser &op);
+  /// @brief Not copy-assignable.
+  OptionParser& operator=(const OptionParser &op);
+
+  std::ostream& error() { return *error_; }
+
+  std::vector::iterator FindOption(const std::string& name);
+
+  std::vector options_;
+
+  std::string unknownOptions_;
+  bool collectUnknown_;
+
+  std::ostream *error_;
+};
+
+} // namespace options
+} // namespace amd
+
+#endif // AMD_OPTIONS_HPP
diff --git a/runtime/hsa-runtime/loader/CMakeLists.txt b/runtime/hsa-runtime/loader/CMakeLists.txt
new file mode 100644
index 0000000000..9c94952be3
--- /dev/null
+++ b/runtime/hsa-runtime/loader/CMakeLists.txt
@@ -0,0 +1,16 @@
+#
+# loader library
+#
+# This file is expected to be included from top-level CMakeLists.txt.
+#
+# Dependencies:
+# - Compiler definitions
+# - amdhsacode library
+#
+# Defines:
+# - amdhsaloader library and target include directories
+
+file(GLOB sources *.cpp *.hpp)
+add_library(amdhsaloader ${sources})
+target_include_directories(amdhsaloader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(amdhsaloader amdhsacode)
diff --git a/runtime/hsa-runtime/loader/executable.cpp b/runtime/hsa-runtime/loader/executable.cpp
new file mode 100644
index 0000000000..b66e4ca1ac
--- /dev/null
+++ b/runtime/hsa-runtime/loader/executable.cpp
@@ -0,0 +1,1175 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "executable.hpp" + +#include +#include +#include +#include +#include +#include "amd_hsa_elf.h" +#include "amd_hsa_kernel_code.h" +#include "amd_hsa_code.hpp" + +using namespace amd::hsa; +using namespace amd::hsa::common; + +namespace { + +bool IsBasePm4(hsa_profile_t profile) { + if (profile == HSA_PROFILE_FULL) { return false; } + char *emulate_aql = getenv("HSA_EMULATE_AQL"); + if (nullptr == emulate_aql) { return false; } + char *tools_lib = getenv("HSA_TOOLS_LIB"); + if (nullptr == tools_lib) { return false; } + return "1" == std::string(emulate_aql) && 0 != std::string(tools_lib).size(); +} + +} // namespace anonymous + +namespace amd { +namespace hsa { +namespace loader { + +Loader* Loader::Create(Context* context) +{ + return new AmdHsaCodeLoader(context); +} + +void Loader::Destroy(Loader *loader) +{ + delete loader; +} + +Executable* AmdHsaCodeLoader::CreateExecutable( + hsa_profile_t profile, const char *options) +{ + std::lock_guard lock(executables_mutex); + + executables.push_back(new ExecutableImpl(profile, context, executables.size())); + return executables.back(); +} + +void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) +{ + std::lock_guard lock(executables_mutex); + executables[((ExecutableImpl*)executable)->id()] = nullptr; + delete executable; +} + +hsa_status_t AmdHsaCodeLoader::IterateExecutables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data) +{ + std::lock_guard lock(executables_mutex); + assert(callback); + + for (auto &exec : executables) { + hsa_status_t status = callback(Executable::Handle(exec), data); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + } + + return HSA_STATUS_SUCCESS; +} + +//===----------------------------------------------------------------------===// +// SymbolImpl. 
//
+//===----------------------------------------------------------------------===//
+
+/// @brief Writes the attribute selected by @p symbol_info to @p value.
+///
+/// `name` holds the symbol name; for symbols without program linkage it is
+/// expected to contain ':' separators, with the module name before the first
+/// ':' and the symbol name after the last ':'.
+///
+/// For the NAME and MODULE_NAME attributes, size + 1 bytes (text plus a
+/// terminating NUL) are written to @p value; the matching *_LENGTH attribute
+/// reports that byte count. For program-linkage symbols MODULE_NAME_LENGTH is
+/// 0 and MODULE_NAME leaves @p value untouched.
+///
+/// @returns false for an unknown attribute, or for the address/agent
+/// attributes while the symbol is not loaded; true otherwise.
+bool SymbolImpl::GetInfo(hsa_symbol_info32_t symbol_info, void *value) {
+  // The switch below uses the HSA_CODE_SYMBOL_INFO_* constants for both code
+  // and executable symbol queries, so the two enumerations must agree.
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_TYPE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_TYPE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_NAME_LENGTH) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_NAME) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_NAME)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_MODULE_NAME) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_LINKAGE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_IS_DEFINITION) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION)),
+    "attributes are not compatible"
+  );
+
+  assert(value);
+
+  switch (symbol_info) {
+    case HSA_CODE_SYMBOL_INFO_TYPE: {
+      *((hsa_symbol_kind_t*)value) = kind;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_NAME_LENGTH: {
+      std::string matter = "";
+
+      if (linkage == HSA_SYMBOL_LINKAGE_PROGRAM) {
+        assert(name.rfind(":") == std::string::npos);
+        matter = name;
+      } else {
+        assert(name.rfind(":") != std::string::npos);
+        matter = name.substr(name.rfind(":") + 1);
+      }
+
+      *((uint32_t*)value) = matter.size() + 1;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_NAME: {
+      std::string matter = "";
+
+      if (linkage == HSA_SYMBOL_LINKAGE_PROGRAM) {
+        assert(name.rfind(":") == std::string::npos);
+        matter = name;
+      } else {
+        assert(name.rfind(":") != std::string::npos);
+        matter = name.substr(name.rfind(":") + 1);
+      }
+
+      memset(value, 0x0, matter.size() + 1);
+      memcpy(value, matter.c_str(), matter.size());
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH: {
+      std::string matter = "";
+
+      if (linkage == HSA_SYMBOL_LINKAGE_PROGRAM) {
+        assert(name.find(":") == std::string::npos);
+        *((uint32_t*)value) = 0;
+        return true;
+      }
+
+      assert(name.find(":") != std::string::npos);
+      matter = name.substr(0, name.find(":"));
+
+      *((uint32_t*)value) = matter.size() + 1;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_MODULE_NAME: {
+      std::string matter = "";
+
+      if (linkage == HSA_SYMBOL_LINKAGE_PROGRAM) {
+        assert(name.find(":") == std::string::npos);
+        return true;
+      }
+
+      assert(name.find(":") != std::string::npos);
+      matter = name.substr(0, name.find(":"));
+
+      memset(value, 0x0, matter.size() + 1);
+      memcpy(value, matter.c_str(), matter.size());
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_LINKAGE: {
+      *((hsa_symbol_linkage_t*)value) = linkage;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_IS_DEFINITION: {
+      *((bool*)value) = is_definition;
+      break;
+    }
+    case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT:
+    case HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS: {
+      if (!is_loaded) {
+        return false;
+      }
+      *((uint64_t*)value) = address;
+      break;
+    }
+    case HSA_EXECUTABLE_SYMBOL_INFO_AGENT: {
+      if (!is_loaded) {
+        return false;
+      }
+      *((hsa_agent_t*)value) = agent;
+      break;
+    }
+    default: {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// KernelSymbol.
//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): the target types of the reinterpret_cast expressions in this
+// section appear to have been stripped from this copy of the patch; they are
+// reproduced as found.
+
+/// @brief Writes the kernel attribute selected by @p symbol_info to @p value;
+/// attributes not handled here are forwarded to SymbolImpl::GetInfo.
+bool KernelSymbol::GetInfo(hsa_symbol_info32_t symbol_info, void *value) {
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK)),
+    "attributes are not compatible"
+  );
+
+  assert(value);
+
+  switch (symbol_info) {
+    case HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE: {
+      *((uint32_t*)value) = kernarg_segment_size;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT: {
+      *((uint32_t*)value) = kernarg_segment_alignment;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE: {
+      *((uint32_t*)value) = group_segment_size;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE: {
+      *((uint32_t*)value) = private_segment_size;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK: {
+      *((bool*)value) = is_dynamic_callstack;
+      break;
+    }
+    case HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE: {
+      *((uint32_t*)value) = size;
+      break;
+    }
+    case HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN: {
+      *((uint32_t*)value) = alignment;
+      break;
+    }
+    default: {
+      return SymbolImpl::GetInfo(symbol_info, value);
+    }
+  }
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// VariableSymbol.                                                            //
+//===----------------------------------------------------------------------===//
+
+/// @brief Writes the variable attribute selected by @p symbol_info to
+/// @p value; attributes not handled here are forwarded to SymbolImpl::GetInfo.
+bool VariableSymbol::GetInfo(hsa_symbol_info32_t symbol_info, void *value) {
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST)),
+    "attributes are not compatible"
+  );
+
+  // Same precondition as the other GetInfo implementations in this file.
+  assert(value);
+
+  switch (symbol_info) {
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION: {
+      *((hsa_variable_allocation_t*)value) = allocation;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT: {
+      *((hsa_variable_segment_t*)value) = segment;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT: {
+      *((uint32_t*)value) = alignment;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE: {
+      *((uint32_t*)value) = size;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST: {
+      *((bool*)value) = is_constant;
+      break;
+    }
+    default: {
+      return SymbolImpl::GetInfo(symbol_info, value);
+    }
+  }
+
+  return true;
+}
+
+/// @brief Reports the ELF image pointer or its size for this loaded code
+/// object. @returns false for any other attribute.
+bool LoadedCodeObjectImpl::GetInfo(amd_loaded_code_object_info_t attribute, void *value)
+{
+  assert(value);
+
+  switch (attribute) {
+    case AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE:
+      ((hsa_code_object_t*)value)->handle = reinterpret_cast(elf_data);
+      break;
+    case AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE:
+      *((size_t*)value) = elf_size;
+      break;
+    default: {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/// @brief Invokes @p callback for each entry of loaded_segments, stopping at
+/// and returning the first status other than HSA_STATUS_SUCCESS.
+hsa_status_t LoadedCodeObjectImpl::IterateLoadedSegments(
+  hsa_status_t (*callback)(
+    amd_loaded_segment_t loaded_segment,
+    void *data),
+  void *data)
+{
+  assert(callback);
+
+  for (auto &loaded_segment : loaded_segments) {
+    hsa_status_t status = callback(LoadedSegment::Handle(loaded_segment), data);
+    if (status != HSA_STATUS_SUCCESS) {
+      return status;
+    }
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/// @brief Reports the segment type, ELF base address, load base address or
+/// size. @returns false for any other attribute.
+bool Segment::GetInfo(amd_loaded_segment_info_t attribute, void *value)
+{
+  assert(value);
+
+  switch (attribute) {
+    case AMD_LOADED_SEGMENT_INFO_TYPE: {
+      *((amdgpu_hsa_elf_segment_t*)value) = segment;
+      break;
+    }
+    case AMD_LOADED_SEGMENT_INFO_ELF_BASE_ADDRESS: {
+      *((uint64_t*)value) = vaddr;
+      break;
+    }
+    case AMD_LOADED_SEGMENT_INFO_LOAD_BASE_ADDRESS: {
+      *((uint64_t*)value) = reinterpret_cast(this->Address(this->VAddr()));
+      break;
+    }
+    case AMD_LOADED_SEGMENT_INFO_SIZE: {
+      *((size_t*)value) = size;
+      break;
+    }
+    default: {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/// @brief Converts an address inside the segment to an offset from vaddr.
+uint64_t Segment::Offset(uint64_t addr)
+{
+  assert(IsAddressInSegment(addr));
+  return addr - vaddr;
+}
+
+/// @brief Asks the context for the address that corresponds to @p addr within
+/// this segment's allocation.
+void* Segment::Address(uint64_t addr)
+{
+  return owner->context()->SegmentAddress(segment, agent, ptr, Offset(addr));
+}
+
+/// @brief Freezes the segment through the context unless it is already
+/// frozen; the context's result is stored in `frozen` and returned.
+bool Segment::Freeze()
+{
+  return !frozen ? (frozen = owner->context()->SegmentFreeze(segment, agent, ptr, size)) : true;
+}
+
+/// @brief True when @p addr lies in [vaddr, vaddr + size).
+bool Segment::IsAddressInSegment(uint64_t addr)
+{
+  return vaddr <= addr && addr < vaddr + size;
+}
+
+/// @brief Copies @p size bytes from @p src into the segment at @p addr via the
+/// context; a zero size is a no-op.
+void Segment::Copy(uint64_t addr, const void* src, size_t size)
+{
+  // loader must do copies before freezing.
+  assert(!frozen);
+
+  if (size > 0) {
+    owner->context()->SegmentCopy(segment, agent, ptr, Offset(addr), src, size);
+  }
+}
+
+/// @brief Releases the segment's allocation through the context.
+void Segment::Destroy()
+{
+  owner->context()->SegmentFree(segment, agent, ptr, size);
+}
+
+//===----------------------------------------------------------------------===//
+// ExecutableImpl.                                                            //
+//===----------------------------------------------------------------------===//
+
+/// @brief Creates an empty, unfrozen executable with the given profile,
+/// loader context and id.
+ExecutableImpl::ExecutableImpl(const hsa_profile_t &_profile, Context *context, size_t id)
+  : Executable()
+  , profile_(_profile)
+  , context_(context)
+  , id_(id)
+  , state_(HSA_EXECUTABLE_STATE_UNFROZEN)
+  , program_allocation_segment(nullptr)
+{
+}
+
+/// @brief Destroys and deletes every object in `objects`, then deletes all
+/// program and agent symbols.
+ExecutableImpl::~ExecutableImpl() {
+  for (ExecutableObject* o : objects) {
+    o->Destroy();
+    delete o;
+  }
+  objects.clear();
+
+  for (auto &symbol_entry : program_symbols_) {
+    delete symbol_entry.second;
+  }
+  for (auto &symbol_entry : agent_symbols_) {
+    delete symbol_entry.second;
+  }
+}
+
+/// @brief Registers an externally defined program-allocation global variable
+/// named @p name located at @p address.
+/// @returns HSA_STATUS_ERROR_FROZEN_EXECUTABLE if the executable is frozen,
+/// HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED if a program symbol with that
+/// name exists, HSA_STATUS_SUCCESS otherwise.
+hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
+  const char *name, void *address)
+{
+  WriterLockGuard writer_lock(rw_lock_);
+  assert(name);
+  assert(address);
+
+  if (HSA_EXECUTABLE_STATE_FROZEN == state_) {
+    return HSA_STATUS_ERROR_FROZEN_EXECUTABLE;
+  }
+
+  auto symbol_entry = program_symbols_.find(std::string(name));
+  if (symbol_entry != program_symbols_.end()) {
+    return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED;
+  }
+
+  program_symbols_.insert(
+    std::make_pair(std::string(name),
+                   new VariableSymbol(true,
+                                      std::string(name),
+                                      HSA_SYMBOL_LINKAGE_PROGRAM,
+                                      true,
+                                      HSA_VARIABLE_ALLOCATION_PROGRAM,
+                                      HSA_VARIABLE_SEGMENT_GLOBAL,
+                                      0,     // TODO: size.
+                                      0,     // TODO: align.
+                                      false, // TODO: const.
+                                      true,
+                                      reinterpret_cast(address))));
+  return HSA_STATUS_SUCCESS;
+}
+
+/// @brief Registers an externally defined agent-allocation variable named
+/// @p name for @p agent in @p segment, located at @p address.
+/// @returns HSA_STATUS_ERROR_FROZEN_EXECUTABLE if the executable is frozen,
+/// HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED if that (name, agent) pair
+/// exists, HSA_STATUS_SUCCESS otherwise.
+hsa_status_t ExecutableImpl::DefineAgentExternalVariable(
+  const char *name,
+  hsa_agent_t agent,
+  hsa_variable_segment_t segment,
+  void *address)
+{
+  WriterLockGuard writer_lock(rw_lock_);
+  assert(name);
+  assert(address);
+
+  if (HSA_EXECUTABLE_STATE_FROZEN == state_) {
+    return HSA_STATUS_ERROR_FROZEN_EXECUTABLE;
+  }
+
+  auto symbol_entry = agent_symbols_.find(std::make_pair(std::string(name), agent));
+  if (symbol_entry != agent_symbols_.end()) {
+    return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED;
+  }
+
+  agent_symbols_.insert(
+    std::make_pair(std::make_pair(std::string(name), agent),
+                   new VariableSymbol(true,
+                                      std::string(name),
+                                      HSA_SYMBOL_LINKAGE_PROGRAM,
+                                      true,
+                                      HSA_VARIABLE_ALLOCATION_AGENT,
+                                      segment,
+                                      0,     // TODO: size.
+                                      0,     // TODO: align.
+                                      false, // TODO: const.
+                                      true,
+                                      reinterpret_cast(address))));
+  return HSA_STATUS_SUCCESS;
+}
+
+/// @brief Reader-locked wrapper around GetSymbolInternal.
+Symbol* ExecutableImpl::GetSymbol(
+  const char *module_name,
+  const char *symbol_name,
+  hsa_agent_t agent,
+  int32_t call_convention)
+{
+  ReaderLockGuard reader_lock(rw_lock_);
+  return this->GetSymbolInternal(module_name, symbol_name, agent, call_convention);
+}
+
+/// @brief Looks up a symbol by "<module_name>::<symbol_name>" (or just
+/// <symbol_name> when @p module_name is empty), first among program symbols,
+/// then among the symbols of @p agent. @p call_convention is not used.
+/// @returns The symbol, or nullptr if @p symbol_name is empty or no symbol
+/// matches.
+Symbol* ExecutableImpl::GetSymbolInternal(
+  const char *module_name,
+  const char *symbol_name,
+  hsa_agent_t agent,
+  int32_t call_convention)
+{
+  assert(module_name);
+  assert(symbol_name);
+
+  std::string mangled_name = std::string(symbol_name);
+  if (mangled_name.empty()) {
+    return nullptr;
+  }
+  if (!std::string(module_name).empty()) {
+    mangled_name.insert(0, "::");
+    mangled_name.insert(0, std::string(module_name));
+  }
+
+  auto program_symbol = program_symbols_.find(mangled_name);
+  if (program_symbol != program_symbols_.end()) {
+    return program_symbol->second;
+  }
+  auto agent_symbol = agent_symbols_.find(std::make_pair(mangled_name, agent));
+  if (agent_symbol != agent_symbols_.end()) {
+    return agent_symbol->second;
+  }
+  return nullptr;
+}
+
+hsa_status_t ExecutableImpl::IterateSymbols( + iterate_symbols_f callback, void *data) +{ + ReaderLockGuard reader_lock(rw_lock_); + assert(callback); + + for (auto &symbol_entry : program_symbols_) { + hsa_status_t hsc = + callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data); + if (HSA_STATUS_SUCCESS != hsc) { + return hsc; + } + } + for (auto &symbol_entry : agent_symbols_) { + hsa_status_t hsc = + callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data); + if (HSA_STATUS_SUCCESS != hsc) { + return hsc; + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::IterateLoadedCodeObjects( + hsa_status_t (*callback)( + amd_loaded_code_object_t loaded_code_object, + void *data), + void *data) +{ + ReaderLockGuard reader_lock(rw_lock_); + assert(callback); + + for (auto &loaded_code_object : loaded_code_objects) { + hsa_status_t status = callback(LoadedCodeObject::Handle(loaded_code_object), data); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + } + + return HSA_STATUS_SUCCESS; +} + +#define HSAERRCHECK(hsc) \ + if (hsc != HSA_STATUS_SUCCESS) { \ + assert(false); \ + return hsc; \ + } \ + + +hsa_status_t ExecutableImpl::GetInfo( + hsa_executable_info_t executable_info, void *value) +{ + ReaderLockGuard reader_lock(rw_lock_); + + assert(value); + + switch (executable_info) { + case HSA_EXECUTABLE_INFO_PROFILE: { + *((hsa_profile_t*)value) = profile_;; + break; + } + case HSA_EXECUTABLE_INFO_STATE: { + *((hsa_executable_state_t*)value) = state_; + break; + } + default: { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + amd_loaded_code_object_t *loaded_code_object) +{ + return LoadCodeObject(agent, code_object, 0, options, loaded_code_object); +} + +hsa_status_t ExecutableImpl::LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t 
code_object, + size_t code_object_size, + const char *options, + amd_loaded_code_object_t *loaded_code_object) +{ + WriterLockGuard writer_lock(rw_lock_); + if (HSA_EXECUTABLE_STATE_FROZEN == state_) { + return HSA_STATUS_ERROR_FROZEN_EXECUTABLE; + } + + code.reset(new code::AmdHsaCode()); + + if (!code->InitAsHandle(code_object)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + std::string codeIsa; + if (!code->GetNoteIsa(codeIsa)) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + + hsa_isa_t objectsIsa = context_->IsaFromName(codeIsa.c_str()); + if (!objectsIsa.handle) { return HSA_STATUS_ERROR_INVALID_ISA_NAME; } + + if (!context_->IsaSupportedByAgent(agent, objectsIsa)) { return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; } + + uint32_t majorVersion, minorVersion; + if (!code->GetNoteCodeObjectVersion(&majorVersion, &minorVersion)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + if (majorVersion != 1 && majorVersion != 2) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + + hsa_status_t status; + + objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize())); + loaded_code_objects.push_back((LoadedCodeObjectImpl*)objects.back()); + + for (size_t i = 0; i < code->DataSegmentCount(); ++i) { + status = LoadSegment(agent, code->DataSegment(i), majorVersion, code->Machine()); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + + for (size_t i = 0; i < code->SymbolCount(); ++i) { + status = LoadSymbol(agent, code->GetSymbol(i)); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + + for (size_t i = 0; i < code->RelocationSectionCount(); ++i) { + status = LoadRelocationSection(agent, code->GetRelocationSection(i)); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + + code.reset(); + if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back()); } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadSegment(hsa_agent_t agent, 
code::Segment* s, + uint32_t majorVersion, uint16_t machine) +{ + if (majorVersion >= 2) + return LoadSegmentV2(agent, s, machine); + else + return LoadSegmentV1(agent, s); + +} + +hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent, code::Segment* s) +{ + assert(s->type() < PT_LOOS + AMDGPU_HSA_SEGMENT_LAST); + if (s->memSize() == 0) + return HSA_STATUS_SUCCESS; + amdgpu_hsa_elf_segment_t segment = (amdgpu_hsa_elf_segment_t)(s->type() - PT_LOOS); + Segment *new_seg = nullptr; + bool need_alloc = true; + if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM && nullptr != program_allocation_segment) { + new_seg = program_allocation_segment; + need_alloc = false; + } + if (need_alloc) { + void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true); + if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } + new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr()); + new_seg->Copy(s->vaddr(), s->data(), s->imageSize()); + objects.push_back(new_seg); + + if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) { + program_allocation_segment = new_seg; + } + } + assert(new_seg); + loaded_code_objects.back()->LoadedSegments().push_back(new_seg); + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadSymbol(hsa_agent_t agent, code::Symbol* sym) +{ + if (sym->IsDeclaration()) { + return LoadDeclarationSymbol(agent, sym); + } else { + return LoadDefinitionSymbol(agent, sym); + } +} + +hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent, code::Symbol* sym) +{ + if (sym->IsAgent()) { + auto agent_symbol = agent_symbols_.find(std::make_pair(sym->Name(), agent)); + if (agent_symbol != agent_symbols_.end()) { + // TODO(spec): this is not spec compliant. + return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED; + } + } else { + auto program_symbol = program_symbols_.find(sym->Name()); + if (program_symbol != program_symbols_.end()) { + // TODO(spec): this is not spec compliant. 
+ return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED; + } + } + + uint64_t address = SymbolAddress(agent, sym); + if (!address) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + + SymbolImpl *symbol = nullptr; + if (sym->IsVariableSymbol()) { + symbol = new VariableSymbol(true, + sym->Name(), + sym->Linkage(), + true, // sym->IsDefinition() + sym->Allocation(), + sym->Segment(), + sym->Size(), + sym->Alignment(), + sym->IsConst(), + false, + address); + } else if (sym->IsKernelSymbol()) { + amd_kernel_code_t akc; + sym->GetSection()->getData(sym->SectionOffset(), &akc, sizeof(akc)); + + uint32_t kernarg_segment_size = + uint32_t(akc.kernarg_segment_byte_size); + uint32_t kernarg_segment_alignment = + uint32_t(1 << akc.kernarg_segment_alignment); + uint32_t group_segment_size = + uint32_t(akc.workgroup_group_segment_byte_size); + uint32_t private_segment_size = + uint32_t(akc.workitem_private_segment_byte_size); + bool is_dynamic_callstack = + AMD_HSA_BITS_GET(akc.kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK) ? true : false; + + KernelSymbol *kernel_symbol = new KernelSymbol(true, + sym->Name(), + sym->Linkage(), + true, // sym->IsDefinition() + kernarg_segment_size, + kernarg_segment_alignment, + group_segment_size, + private_segment_size, + is_dynamic_callstack, + sym->Size(), + 256, + address); + kernel_symbol->debug_info.elf_raw = code->ElfData(); + kernel_symbol->debug_info.elf_size = code->ElfSize(); + kernel_symbol->debug_info.kernel_name = kernel_symbol->name.c_str(); + kernel_symbol->debug_info.owning_segment = (void*)SymbolSegment(agent, sym)->Address(sym->GetSection()->addr()); + kernel_symbol->debug_info.profile = profile_; + + // \todo kzhuravl 11/17/15 This is a temporary rt hack: needs to be + // removed when large bar is supported. 
+ if (IsBasePm4(profile_)) { + kernel_symbol->debug_info.gpuva = kernel_symbol->address; + Segment *kernel_symbol_segment = SymbolSegment(agent, sym); + kernel_symbol->address = + (uint64_t) (uintptr_t) context_->SegmentHostAddress( + kernel_symbol_segment->ElfSegment(), + kernel_symbol_segment->Agent(), + kernel_symbol_segment->Ptr(), + kernel_symbol_segment->Offset(sym->VAddr())); + } + symbol = kernel_symbol; + + // \todo kzhuravl 10/15/15 This is a debugger backdoor: needs to be + // removed. + uint64_t target_address = sym->GetSection()->addr() + sym->SectionOffset() + ((size_t)(&((amd_kernel_code_t*)0)->runtime_loader_kernel_symbol)); + uint64_t source_value = (uint64_t) (uintptr_t) &kernel_symbol->debug_info; + SymbolSegment(agent, sym)->Copy(target_address, &source_value, sizeof(source_value)); + } else { + assert(!"Unexpected symbol type in LoadDefinitionSymbol"); + return HSA_STATUS_ERROR; + } + assert(symbol); + if (sym->IsAgent()) { + agent_symbols_.insert(std::make_pair(std::make_pair(sym->Name(), agent), symbol)); + } else { + program_symbols_.insert(std::make_pair(sym->Name(), symbol)); + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadDeclarationSymbol(hsa_agent_t agent, code::Symbol* sym) +{ + auto program_symbol = program_symbols_.find(sym->Name()); + if (program_symbol == program_symbols_.end()) { + auto agent_symbol = agent_symbols_.find(std::make_pair(sym->Name(), agent)); + if (agent_symbol == agent_symbols_.end()) { + // TODO(spec): this is not spec compliant. + return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; + } + } + return HSA_STATUS_SUCCESS; +} + +uint64_t ExecutableImpl::SymbolAddress(hsa_agent_t agent, code::Symbol* sym) +{ + code::Section* sec = sym->GetSection(); + Segment* seg = SectionSegment(agent, sec); + return nullptr == seg ? 
0 : (uint64_t) (uintptr_t) seg->Address(sym->VAddr()); +} + +uint64_t ExecutableImpl::SymbolAddress(hsa_agent_t agent, elf::Symbol* sym) +{ + elf::Section* sec = sym->section(); + Segment* seg = SectionSegment(agent, sec); + uint64_t vaddr = sec->addr() + sym->value(); + return nullptr == seg ? 0 : (uint64_t) (uintptr_t) seg->Address(vaddr); +} + +Segment* ExecutableImpl::SymbolSegment(hsa_agent_t agent, code::Symbol* sym) +{ + return SectionSegment(agent, sym->GetSection()); +} + +Segment* ExecutableImpl::SectionSegment(hsa_agent_t agent, code::Section* sec) +{ + for (Segment* seg : loaded_code_objects.back()->LoadedSegments()) { + if (seg->IsAddressInSegment(sec->addr())) { + return seg; + } + } + return 0; +} + +hsa_status_t ExecutableImpl::LoadRelocationSection(hsa_agent_t agent, code::RelocationSection* sec) +{ + hsa_status_t status; + for (size_t i = 0; i < sec->relocationCount(); ++i) { + status = LoadRelocation(agent, sec->relocation(i)); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadRelocation(hsa_agent_t agent, code::Relocation* rel) +{ + hsa_status_t status; + amd::elf::Symbol* sym = rel->symbol(); + code::RelocationSection* rsec = rel->section(); + code::Section* sec = rsec->targetSection(); + Segment* rseg = SectionSegment(agent, sec); + size_t reladdr = sec->addr() + rel->offset(); + switch (rel->type()) { + case R_AMDGPU_32_LOW: + case R_AMDGPU_32_HIGH: + case R_AMDGPU_64: + { + uint64_t addr; + switch (sym->type()) { + case STT_OBJECT: + case STT_SECTION: + case STT_AMDGPU_HSA_KERNEL: + case STT_AMDGPU_HSA_INDIRECT_FUNCTION: + addr = SymbolAddress(agent, sym); + if (!addr) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + break; + case STT_COMMON: { + hsa_agent_t sagent = agent; + if (STA_AMDGPU_HSA_GLOBAL_PROGRAM == ELF64_ST_AMDGPU_ALLOCATION(sym->other())) { + sagent.handle = 0; + } + SymbolImpl* esym = (SymbolImpl*) GetSymbolInternal("", sym->name().c_str(), sagent, 
0); + if (!esym) { return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; } + addr = esym->address; + break; + } + default: + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + addr += rel->addend(); + + uint32_t addr32 = 0; + switch (rel->type()) { + case R_AMDGPU_32_HIGH: + addr32 = uint32_t((addr >> 32) & 0xFFFFFFFF); + rseg->Copy(reladdr, &addr32, sizeof(addr32)); + break; + case R_AMDGPU_32_LOW: + addr32 = uint32_t(addr & 0xFFFFFFFF); + rseg->Copy(reladdr, &addr32, sizeof(addr32)); + break; + case R_AMDGPU_64: + rseg->Copy(reladdr, &addr, sizeof(addr)); + break; + default: + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + break; + } + + case R_AMDGPU_INIT_SAMPLER: + { + if (STT_AMDGPU_HSA_METADATA != sym->type() || + SHT_PROGBITS != sym->section()->type() || + !(sym->section()->flags() & SHF_MERGE)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + amdgpu_hsa_sampler_descriptor_t desc; + if (!sym->section()->getData(sym->value(), &desc, sizeof(desc))) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + if (AMDGPU_HSA_METADATA_KIND_INIT_SAMP != desc.kind) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + hsa_ext_sampler_descriptor_t hsa_sampler_descriptor; + hsa_sampler_descriptor.coordinate_mode = + hsa_ext_sampler_coordinate_mode_t(desc.coord); + hsa_sampler_descriptor.filter_mode = + hsa_ext_sampler_filter_mode_t(desc.filter); + hsa_sampler_descriptor.address_mode = + hsa_ext_sampler_addressing_mode_t(desc.addressing); + + hsa_ext_sampler_t hsa_sampler = {0}; + status = context_->SamplerCreate(agent, &hsa_sampler_descriptor, &hsa_sampler); + if (status != HSA_STATUS_SUCCESS) { return status; } + assert(hsa_sampler.handle); + rseg->Copy(reladdr, &hsa_sampler, sizeof(hsa_sampler)); + break; + } + + case R_AMDGPU_INIT_IMAGE: + { + if (STT_AMDGPU_HSA_METADATA != sym->type() || + SHT_PROGBITS != sym->section()->type() || + !(sym->section()->flags() & SHF_MERGE)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + amdgpu_hsa_image_descriptor_t desc; + 
if (!sym->section()->getData(sym->value(), &desc, sizeof(desc))) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + if (AMDGPU_HSA_METADATA_KIND_INIT_ROIMG != desc.kind && + AMDGPU_HSA_METADATA_KIND_INIT_WOIMG != desc.kind && + AMDGPU_HSA_METADATA_KIND_INIT_RWIMG != desc.kind) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + hsa_ext_image_format_t hsa_image_format; + hsa_image_format.channel_order = + hsa_ext_image_channel_order_t(desc.channel_order); + hsa_image_format.channel_type = + hsa_ext_image_channel_type_t(desc.channel_type); + + hsa_ext_image_descriptor_t hsa_image_descriptor; + hsa_image_descriptor.geometry = + hsa_ext_image_geometry_t(desc.geometry); + hsa_image_descriptor.width = size_t(desc.width); + hsa_image_descriptor.height = size_t(desc.height); + hsa_image_descriptor.depth = size_t(desc.depth); + hsa_image_descriptor.array_size = size_t(desc.array); + hsa_image_descriptor.format = hsa_image_format; + + hsa_access_permission_t hsa_image_permission = HSA_ACCESS_PERMISSION_RO; + switch (desc.kind) { + case AMDGPU_HSA_METADATA_KIND_INIT_ROIMG: { + hsa_image_permission = HSA_ACCESS_PERMISSION_RO; + break; + } + case AMDGPU_HSA_METADATA_KIND_INIT_WOIMG: { + hsa_image_permission = HSA_ACCESS_PERMISSION_WO; + break; + } + case AMDGPU_HSA_METADATA_KIND_INIT_RWIMG: { + hsa_image_permission = HSA_ACCESS_PERMISSION_RW; + break; + } + default: { + assert(false); + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + } + + hsa_ext_image_t hsa_image = {0}; + status = context_->ImageCreate(agent, hsa_image_permission, + &hsa_image_descriptor, + NULL, // TODO: image_data? 
+ &hsa_image); + if (status != HSA_STATUS_SUCCESS) { return status; } + rseg->Copy(reladdr, &hsa_image, sizeof(hsa_image)); + break; + } + + default: + // Ignore + break; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::Freeze(const char *options) { + amd::hsa::common::WriterLockGuard writer_lock(rw_lock_); + if (HSA_EXECUTABLE_STATE_FROZEN == state_) { + return HSA_STATUS_ERROR_FROZEN_EXECUTABLE; + } + + for (auto &lco : loaded_code_objects) { + for (auto &ls : lco->LoadedSegments()) { + ls->Freeze(); + } + } + + state_ = HSA_EXECUTABLE_STATE_FROZEN; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadSegmentV2(hsa_agent_t agent, code::Segment* s, uint16_t machine) +{ + amdgpu_hsa_elf_segment_t segment; + + if (s->memSize() == 0) + return HSA_STATUS_SUCCESS; + + // FIXME: Should support EM_HSA_VENDOR + if (machine == EM_AMDGPU) { + if (s->flags() & PF_X) + segment = AMDGPU_HSA_SEGMENT_CODE_AGENT; + else if (s->flags() & PF_W) + segment = AMDGPU_HSA_SEGMENT_GLOBAL_AGENT; + else { + assert (s->flags() & PF_R); + segment = AMDGPU_HSA_SEGMENT_READONLY_AGENT; + } + } else { // EM_HSA_SHARED + segment = AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM; + } + + void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true); + if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } + + Segment *new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr()); + new_seg->Copy(s->vaddr(), s->data(), s->imageSize()); + objects.push_back(new_seg); + assert(new_seg); + + loaded_code_objects.back()->LoadedSegments().push_back(new_seg); + return HSA_STATUS_SUCCESS; +} + +} // namespace loader +} // namespace hsa +} // namespace amd diff --git a/runtime/hsa-runtime/loader/executable.hpp b/runtime/hsa-runtime/loader/executable.hpp new file mode 100644 index 0000000000..d4e19cfbb8 --- /dev/null +++ b/runtime/hsa-runtime/loader/executable.hpp @@ -0,0 +1,465 @@ 
+//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_LOADER_EXECUTABLE_HPP_ +#define HSA_RUNTIME_CORE_LOADER_EXECUTABLE_HPP_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hsa.h" +#include "hsa_ext_image.h" +#include "amd_hsa_loader.hpp" +#include "amd_hsa_code.hpp" +#include "amd_hsa_kernel_code.h" +#include "amd_hsa_locks.hpp" + +namespace amd { +namespace hsa { +namespace loader { + +class MemoryAddress; +class SymbolImpl; +class KernelSymbol; +class VariableSymbol; +class ExecutableImpl; + +//===----------------------------------------------------------------------===// +// SymbolImpl. // +//===----------------------------------------------------------------------===// + +typedef uint32_t symbol_attribute32_t; + +class SymbolImpl: public Symbol { +public: + virtual ~SymbolImpl() {} + + bool IsKernel() const { + return HSA_SYMBOL_KIND_KERNEL == kind; + } + bool IsVariable() const { + return HSA_SYMBOL_KIND_VARIABLE == kind; + } + + bool is_loaded; + hsa_symbol_kind_t kind; + std::string name; + hsa_symbol_linkage_t linkage; + bool is_definition; + uint64_t address; + hsa_agent_t agent; + +protected: + SymbolImpl(const bool &_is_loaded, + const hsa_symbol_kind_t &_kind, + const std::string &_name, + const hsa_symbol_linkage_t &_linkage, + const bool &_is_definition, + const uint64_t &_address = 0) + : is_loaded(_is_loaded) + , kind(_kind) + , name(_name) + , linkage(_linkage) + , is_definition(_is_definition) + , address(_address) {} + + virtual bool GetInfo(hsa_symbol_info32_t symbol_info, void *value); + +private: + SymbolImpl(const SymbolImpl &s); + SymbolImpl& operator=(const SymbolImpl &s); +}; + +//===----------------------------------------------------------------------===// +// KernelSymbol. 
// +//===----------------------------------------------------------------------===// + +class KernelSymbol final: public SymbolImpl { +public: + KernelSymbol(const bool &_is_loaded, + const std::string &_name, + const hsa_symbol_linkage_t &_linkage, + const bool &_is_definition, + const uint32_t &_kernarg_segment_size, + const uint32_t &_kernarg_segment_alignment, + const uint32_t &_group_segment_size, + const uint32_t &_private_segment_size, + const bool &_is_dynamic_callstack, + const uint32_t &_size, + const uint32_t &_alignment, + const uint64_t &_address = 0) + : SymbolImpl(_is_loaded, + HSA_SYMBOL_KIND_KERNEL, + _name, + _linkage, + _is_definition, + _address) + , kernarg_segment_size(_kernarg_segment_size) + , kernarg_segment_alignment(_kernarg_segment_alignment) + , group_segment_size(_group_segment_size) + , private_segment_size(_private_segment_size) + , is_dynamic_callstack(_is_dynamic_callstack) + , size(_size) + , alignment(_alignment) {} + + ~KernelSymbol() {} + + bool GetInfo(hsa_symbol_info32_t symbol_info, void *value); + + uint32_t kernarg_segment_size; + uint32_t kernarg_segment_alignment; + uint32_t group_segment_size; + uint32_t private_segment_size; + bool is_dynamic_callstack; + uint32_t size; + uint32_t alignment; + amd_runtime_loader_debug_info_t debug_info; + +private: + KernelSymbol(const KernelSymbol &ks); + KernelSymbol& operator=(const KernelSymbol &ks); +}; + +//===----------------------------------------------------------------------===// +// VariableSymbol. 
// +//===----------------------------------------------------------------------===// + +class VariableSymbol final: public SymbolImpl { +public: + VariableSymbol(const bool &_is_loaded, + const std::string &_name, + const hsa_symbol_linkage_t &_linkage, + const bool &_is_definition, + const hsa_variable_allocation_t &_allocation, + const hsa_variable_segment_t &_segment, + const uint32_t &_size, + const uint32_t &_alignment, + const bool &_is_constant, + const bool &_is_external = false, + const uint64_t &_address = 0) + : SymbolImpl(_is_loaded, + HSA_SYMBOL_KIND_VARIABLE, + _name, + _linkage, + _is_definition, + _address) + , allocation(_allocation) + , segment(_segment) + , size(_size) + , alignment(_alignment) + , is_constant(_is_constant) + , is_external(_is_external) {} + + ~VariableSymbol() {} + + bool GetInfo(hsa_symbol_info32_t symbol_info, void *value); + + hsa_variable_allocation_t allocation; + hsa_variable_segment_t segment; + uint32_t size; + uint32_t alignment; + bool is_constant; + bool is_external; + +private: + VariableSymbol(const VariableSymbol &vs); + VariableSymbol& operator=(const VariableSymbol &vs); +}; + +//===----------------------------------------------------------------------===// +// Executable. 
// +//===----------------------------------------------------------------------===// + +class ExecutableImpl; +class LoadedCodeObjectImpl; +class Segment; + +class ExecutableObject { +protected: + ExecutableImpl *owner; + hsa_agent_t agent; + +public: + ExecutableObject(ExecutableImpl *owner_, hsa_agent_t agent_) + : owner(owner_), agent(agent_) { } + + ExecutableImpl* Owner() const { return owner; } + hsa_agent_t Agent() const { return agent; } + virtual void Destroy() = 0; + + virtual ~ExecutableObject() { } +}; + +class LoadedCodeObjectImpl : public LoadedCodeObject, public ExecutableObject { +private: + LoadedCodeObjectImpl(const LoadedCodeObjectImpl&); + LoadedCodeObjectImpl& operator=(const LoadedCodeObjectImpl&); + + const void *elf_data; + const size_t elf_size; + std::vector loaded_segments; + +public: + LoadedCodeObjectImpl(ExecutableImpl *owner_, hsa_agent_t agent_, const void *elf_data_, size_t elf_size_) + : ExecutableObject(owner_, agent_), elf_data(elf_data_), elf_size(elf_size_) {} + + const void* ElfData() const { return elf_data; } + size_t ElfSize() const { return elf_size; } + std::vector& LoadedSegments() { return loaded_segments; } + + bool GetInfo(amd_loaded_code_object_info_t attribute, void *value) override; + + hsa_status_t IterateLoadedSegments( + hsa_status_t (*callback)( + amd_loaded_segment_t loaded_segment, + void *data), + void *data) override; + + void Destroy() override {} +}; + +class Segment : public LoadedSegment, public ExecutableObject { +private: + amdgpu_hsa_elf_segment_t segment; + void *ptr; + size_t size; + uint64_t vaddr; + bool frozen; + +public: + Segment(ExecutableImpl *owner_, hsa_agent_t agent_, amdgpu_hsa_elf_segment_t segment_, void* ptr_, size_t size_, uint64_t vaddr_) + : ExecutableObject(owner_, agent_), segment(segment_), + ptr(ptr_), size(size_), vaddr(vaddr_), frozen(false) { } + + amdgpu_hsa_elf_segment_t ElfSegment() const { return segment; } + void* Ptr() const { return ptr; } + size_t Size() const { 
return size; } + uint64_t VAddr() const { return vaddr; } + + bool GetInfo(amd_loaded_segment_info_t attribute, void *value) override; + + uint64_t Offset(uint64_t addr); // Offset within segment. Used together with ptr with loader context functions. + + void* Address(uint64_t addr); // Address in segment. Used for relocations and valid on agent. + + bool Freeze(); + + bool IsAddressInSegment(uint64_t addr); + void Copy(uint64_t addr, const void* src, size_t size); + void Destroy() override; +}; + +class Sampler : public ExecutableObject { +private: + hsa_ext_sampler_t samp; + +public: + Sampler(ExecutableImpl *owner, hsa_agent_t agent, hsa_ext_sampler_t samp_) + : ExecutableObject(owner, agent), samp(samp_) { } + void Destroy() override; +}; + +class Image : public ExecutableObject { +private: + hsa_ext_image_t img; + +public: + Image(ExecutableImpl *owner, hsa_agent_t agent, hsa_ext_image_t img_) + : ExecutableObject(owner, agent), img(img_) { } + void Destroy() override; +}; + +typedef std::string ProgramSymbol; +typedef std::unordered_map ProgramSymbolMap; + +typedef std::pair AgentSymbol; +struct ASC { + bool operator()(const AgentSymbol &las, const AgentSymbol &ras) const { + return las.first == ras.first && las.second.handle == ras.second.handle; + } +}; +struct ASH { + size_t operator()(const AgentSymbol &as) const { + size_t h = std::hash()(as.first); + size_t i = std::hash()(as.second.handle); + return h ^ (i << 1); + } +}; +typedef std::unordered_map AgentSymbolMap; + +class ExecutableImpl final: public Executable { +public: + const hsa_profile_t& profile() const { + return profile_; + } + const hsa_executable_state_t& state() const { + return state_; + } + + ExecutableImpl(const hsa_profile_t &_profile, Context *context, size_t id); + + ~ExecutableImpl(); + + hsa_status_t GetInfo(hsa_executable_info_t executable_info, void *value); + + hsa_status_t DefineProgramExternalVariable( + const char *name, void *address); + + hsa_status_t 
DefineAgentExternalVariable( + const char *name, + hsa_agent_t agent, + hsa_variable_segment_t segment, + void *address); + + hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + amd_loaded_code_object_t *loaded_code_object); + + hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + size_t code_object_size, + const char *options, + amd_loaded_code_object_t *loaded_code_object); + + hsa_status_t Freeze(const char *options); + + hsa_status_t Validate(uint32_t *result) { + amd::hsa::common::ReaderLockGuard reader_lock(rw_lock_); + assert(result); + *result = 0; + return HSA_STATUS_SUCCESS; + } + + Symbol* GetSymbol( + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention); + + hsa_status_t IterateSymbols( + iterate_symbols_f callback, void *data); + + hsa_status_t IterateLoadedCodeObjects( + hsa_status_t (*callback)( + amd_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + Context* context() { return context_; } + size_t id() { return id_; } + +private: + ExecutableImpl(const ExecutableImpl &e); + ExecutableImpl& operator=(const ExecutableImpl &e); + + std::unique_ptr code; + + Symbol* GetSymbolInternal( + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention); + + hsa_status_t LoadSegment(hsa_agent_t agent, code::Segment* s, uint32_t majorVersion, uint16_t machine); + hsa_status_t LoadSegmentV1(hsa_agent_t agent, amd::hsa::code::Segment* seg); + hsa_status_t LoadSegmentV2(hsa_agent_t agent, amd::hsa::code::Segment* seg, uint16_t machine); + hsa_status_t LoadSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym); + hsa_status_t LoadDefinitionSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym); + hsa_status_t LoadDeclarationSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym); + hsa_status_t LoadRelocationSection(hsa_agent_t agent, amd::hsa::code::RelocationSection* 
sec); + hsa_status_t LoadRelocation(hsa_agent_t agent, amd::hsa::code::Relocation* rel); + + uint64_t SymbolAddress(hsa_agent_t agent, amd::hsa::code::Symbol* sym); + uint64_t SymbolAddress(hsa_agent_t agent, amd::elf::Symbol* sym); + Segment* SymbolSegment(hsa_agent_t agent, amd::hsa::code::Symbol* sym); + Segment* SectionSegment(hsa_agent_t agent, amd::hsa::code::Section* sec); + + amd::hsa::common::ReaderWriterLock rw_lock_; + hsa_profile_t profile_; + Context *context_; + const size_t id_; + hsa_executable_state_t state_; + + ProgramSymbolMap program_symbols_; + AgentSymbolMap agent_symbols_; + std::vector objects; + Segment *program_allocation_segment; + std::vector loaded_code_objects; +}; + +class AmdHsaCodeLoader : public Loader { +private: + Context* context; + std::vector executables; + std::mutex executables_mutex; + +public: + AmdHsaCodeLoader(Context* context_) + : context(context_) { assert(context); } + + Context* GetContext() const { return context; } + + Executable* CreateExecutable(hsa_profile_t profile, const char *options) override; + + void DestroyExecutable(Executable *executable) override; + + hsa_status_t IterateExecutables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data) override; +}; + +} // namespace loader +} // namespace hsa +} // namespace amd + +#endif // HSA_RUNTIME_CORE_LOADER_EXECUTABLE_HPP_ diff --git a/runtime/hsa-runtime/loader/loaders.cpp b/runtime/hsa-runtime/loader/loaders.cpp new file mode 100644 index 0000000000..e3345db19b --- /dev/null +++ b/runtime/hsa-runtime/loader/loaders.cpp @@ -0,0 +1,234 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include "loaders.hpp" + +namespace amd { +namespace hsa { +namespace loader { + + // Helper function that allocates an aligned memory. 
+ static inline void* + alignedMalloc(size_t size, size_t alignment) + { + #if defined(_WIN32) + return ::_aligned_malloc(size, alignment); + #else + void * ptr = NULL; + alignment = (std::max)(alignment, sizeof(void*)); + if (0 == ::posix_memalign(&ptr, alignment, size)) { + return ptr; + } + return NULL; + #endif + } + + // Helper function that frees an aligned memory. + static inline void + alignedFree(void *ptr) + { + #if defined(_WIN32) + ::_aligned_free(ptr); + #else + free(ptr); + #endif + } + + OfflineLoaderContext::OfflineLoaderContext() + : out(std::cout) + { + invalid.handle = 0; + gfx700.handle = 700; + gfx701.handle = 701; + gfx800.handle = 800; + gfx801.handle = 801; + gfx802.handle = 802; + gfx803.handle = 803; + gfx804.handle = 804; + gfx810.handle = 810; + } + + hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name) + { + std::string sname(name); + if (sname == "AMD:AMDGPU:7:0:0") { + return gfx700; + } else if (sname == "AMD:AMDGPU:7:0:1") { + return gfx701; + } else if (sname == "AMD:AMDGPU:8:0:0") { + return gfx800; + } else if (sname == "AMD:AMDGPU:8:0:1") { + return gfx801; + } else if (sname == "AMD:AMDGPU:8:0:2") { + return gfx802; + } else if (sname == "AMD:AMDGPU:8:0:3") { + return gfx803; + } else if (sname == "AMD:AMDGPU:8:0:4") { + return gfx804; + } else if (sname == "AMD:AMDGPU:8:1:0") { + return gfx810; + } else { + assert(0); + return invalid; + } + } + + bool OfflineLoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) + { + return true; + } + + void* OfflineLoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) + { + void* ptr = alignedMalloc(size, align); + if (zero) { memset(ptr, 0, size); } + out << "SegmentAlloc: " << segment << ": " << "size=" << size << " align=" << align << " zero=" << zero << " result=" << ptr << std::endl; + pointers.insert(ptr); + return ptr; + } + + bool OfflineLoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t 
segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) + { + out << "SegmentCopy: " << segment << ": " << "dst=" << dst << " offset=" << offset << " src=" << src << " size=" << size << std::endl; + if (!dst || !src || dst == src) { + return false; + } + if (0 == size) { + return true; + } + memcpy((char *) dst + offset, src, size); + return true; + } + + void OfflineLoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) + { + out << "SegmentFree: " << segment << ": " << " ptr=" << seg << " size=" << size << std::endl; + pointers.erase(seg); + alignedFree(seg); + } + + void* OfflineLoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) + { + out << "SegmentAddress: " << segment << ": " << " ptr=" << seg << " offset=" << offset << std::endl; + return (char*) seg + offset; + } + + void* OfflineLoaderContext::SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) + { + out << "SegmentHostAddress: " << segment << ": " << " ptr=" << seg << " offset=" << offset << std::endl; + return (char*) seg + offset; + } + + bool OfflineLoaderContext::SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) + { + out << "SegmentFreeze: " << segment << ": " << " ptr=" << seg << " size=" << size << std::endl; + return true; + } + + bool OfflineLoaderContext::ImageExtensionSupported() + { + return true; + } + + hsa_status_t OfflineLoaderContext::ImageCreate( + hsa_agent_t agent, + hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_ext_image_t *image_handle) + { + void* ptr = alignedMalloc(256, 8); + out << "ImageCreate" << ":" << + " permission=" << image_permission << + " geometry=" << image_descriptor->geometry << + " width=" << image_descriptor->width << + " height=" << image_descriptor->height << + " 
depth=" << image_descriptor->depth << + " array_size=" << image_descriptor->array_size << + " channel_type=" << image_descriptor->format.channel_type << + " channel_order=" << image_descriptor->format.channel_order<< + " data=" << image_data << + std::endl; + pointers.insert(ptr); + image_handle->handle = reinterpret_cast(ptr); + return HSA_STATUS_SUCCESS; + } + + hsa_status_t OfflineLoaderContext::ImageDestroy( + hsa_agent_t agent, hsa_ext_image_t image_handle) + { + void* ptr = reinterpret_cast(image_handle.handle); + pointers.erase(ptr); + alignedFree(ptr); + return HSA_STATUS_SUCCESS; + } + + hsa_status_t OfflineLoaderContext::SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle) + { + void* ptr = alignedMalloc(256, 8); + out << "SamplerCreate" << ":" << + " coordinate_mode=" << sampler_descriptor->coordinate_mode << + " filter_mode=" << sampler_descriptor->filter_mode << + " address_mode=" << sampler_descriptor->address_mode << + std::endl; + pointers.insert(ptr); + sampler_handle->handle = reinterpret_cast(ptr); + return HSA_STATUS_SUCCESS; + } + + hsa_status_t OfflineLoaderContext::SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) + { + void* ptr = reinterpret_cast(sampler_handle.handle); + pointers.erase(ptr); + alignedFree(ptr); + return HSA_STATUS_SUCCESS; + } + +} +} +} diff --git a/runtime/hsa-runtime/loader/loaders.hpp b/runtime/hsa-runtime/loader/loaders.hpp new file mode 100644 index 0000000000..9a1df578d9 --- /dev/null +++ b/runtime/hsa-runtime/loader/loaders.hpp @@ -0,0 +1,106 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LOADERS_HPP_ +#define LOADERS_HPP_ + +#include "amd_hsa_loader.hpp" +#include +#include + +namespace amd { +namespace hsa { +namespace loader { + + class OfflineLoaderContext : public amd::hsa::loader::Context { + private: + hsa_isa_t invalid; + hsa_isa_t gfx700, gfx701, gfx800, gfx801, gfx802, gfx803, gfx804, gfx810; + hsa_isa_t reserved; + std::ostream& out; + typedef std::set PointerSet; + PointerSet pointers; + + public: + OfflineLoaderContext(); + + hsa_isa_t IsaFromName(const char *name) override; + + bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; + + void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) override; + + bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) override; + + void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size = 0) override; + + void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; + + void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; + + bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) override; + + bool ImageExtensionSupported(); + + hsa_status_t ImageCreate( + hsa_agent_t agent, + hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_ext_image_t *image_handle); + + hsa_status_t ImageDestroy( + hsa_agent_t agent, hsa_ext_image_t image_handle); + + hsa_status_t SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle); + + hsa_status_t SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle); + }; +} +} +} + +#endif // 
LOADERS_HPP_ diff --git a/runtime/hsa-runtime/utils/sp3/LICENSE.txt b/runtime/hsa-runtime/utils/sp3/LICENSE.txt new file mode 100644 index 0000000000..548cb055df --- /dev/null +++ b/runtime/hsa-runtime/utils/sp3/LICENSE.txt @@ -0,0 +1,19 @@ +Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/runtime/hsa-runtime/utils/sp3/sp3-asic.h b/runtime/hsa-runtime/utils/sp3/sp3-asic.h new file mode 100644 index 0000000000..5696ba53c4 --- /dev/null +++ b/runtime/hsa-runtime/utils/sp3/sp3-asic.h @@ -0,0 +1,181 @@ +//===================================================================== +// Copyright 2016 (c), Advanced Micro Devices, Inc. All rights reserved. 
+//
+/// \author AMD Developer Tools Team
+/// \file
+///
+//=====================================================================
+
+#ifndef SP3_ASIC_H
+#define SP3_ASIC_H
+
+
+#include "sp3-int.h"
+#include "sp3-vm.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// ASIC types
+
+
+// Values for asic_info::backend. ASIC_MAX_BACKEND is a count, not a backend.
+enum asic_backend {
+    ASIC_BACKEND_SI,
+    ASIC_BACKEND_CI,
+    ASIC_BACKEND_GFX8,
+    ASIC_BACKEND_GFX81,
+    ASIC_MAX_BACKEND, // Must be the last entry
+};
+
+
+// Capability identifiers, numbered from 1. Each name mirrors one of the
+// asic_* fields of struct asic_info below.
+enum asic_cap_id {
+    ASIC_THREAD_SIZE = 1,
+    ASIC_FED_INSTRUCTIONS = 2,
+    ASIC_LEGACY_LOG = 3,
+    ASIC_LARGE_DS_READ = 4,
+    ASIC_32BANK_LDS = 5,
+};
+
+
+// One entry of the asics[] table declared further down.
+struct asic_info {
+    const char *name;
+    enum asic_backend backend; // which backend to use
+    int asic_thread_size; // number of threads in a wave
+    int asic_fed_instructions; // FED instructions are available
+    int asic_legacy_log; // Legacy EXP and LOG opcodes are available
+    int asic_large_ds_read; // Large DS read opcodes (96b and 128b) are available
+    int asic_32bank_lds; // Full 32 bank lds P1LL_F16 INTERP instruction available
+};
+
+
+// ASIC-private part of the sp3 state; reached through sp3_state::ap
+// (see the `A` macro below).
+struct sp3_asic_state {
+    struct sp3_asic_aluop {
+        int pos; // original position in code
+        int op, na, nc; // na = number of args, nc = number of consts in args
+        int lds, offset; // lds = is an LDS_IDX_OP subop, offset = LDS offset
+        unsigned dst;
+        unsigned arg[3];
+        unsigned lit[3]; // float literals are no longer float at this point
+        unsigned flags;
+        int scalar;
+    } bundle [5];
+    unsigned lds_lit[2], lds_mask[2];
+    int nbundle;
+    int reorder;
+    int last_reorder, last_po[5];
+    int nscalar; // number of nominally-scalar opcodes in bundle
+    int barrier_after; // require barrier after this clause
+
+    // sp3-r6xx
+    int asic; // index into asics[] (see ASIC / ASICNAME below)
+    struct da_reloc {
+        unsigned addr, ref;
+        struct da_reloc *next;
+    } *da_relocs;
+    struct cf_reloc **instrels;
+    struct cf_reloc *labels;
+    int sinstrels;
+    int slabels;
+    char unk_name[16];
+};
+// Shorthand for the ASIC-private state. Expands to S->ap, so it is only
+// usable where a `struct sp3_state *S` (e.g. declared via the Sp macro from
+// sp3-int.h) is in scope.
+#define A S->ap
+
+
+extern struct asic_info asics[];
+// Entry (and name) of the currently selected ASIC; both rely on `A`.
+#define ASICNAME asics[A->asic].name
+#define ASIC asics[A->asic]
+// `Sp` in the prototypes below is the macro from sp3-int.h; it expands to
+// the parameter declaration `struct sp3_state *S`.
+void set_asic(Sp, int asic);
+int find_asic(const char *name);
+
+
+// opcode tables
+
+// Each table function has a generic entry point plus one per backend
+// (si, ci, gfx8).
+void sp3_unbuild_tables(void);
+void sp3_si_unbuild_tables(void);
+void sp3_ci_unbuild_tables(void);
+void sp3_gfx8_unbuild_tables(void);
+
+void sp3_build_tables(void);
+void sp3_si_build_tables(void);
+void sp3_ci_build_tables(void);
+void sp3_gfx8_build_tables(void);
+
+
+
+
+// helper functions
+
+
+// Format values: the class (FMT_FMT .. FMT_SRF) lives in the upper 16 bits
+// selected by FMT_MASK, the remaining value in the lower 16 bits selected by
+// FMT_IMASK.
+#define FMT_FMT 0x00000000
+#define FMT_COMP 0x00010000
+#define FMT_ENDIAN 0x00020000
+#define FMT_NUM 0x00030000
+#define FMT_SRF 0x00040000
+#define FMT_MASK 0xFFFF0000
+#define FMT_IMASK 0x0000FFFF
+
+void mark_sgpr(Sp, unsigned);
+void mark_vgpr(Sp, unsigned);
+void mark_global(Sp, unsigned);
+void mark_ctemp(Sp, unsigned);
+int is_mod_bool(Sp, pnode *, const char *);
+int get_mod_bool(Sp, pnode *, const char *);
+int get_mod_int(Sp, pnode *, int, int);
+int get_mod_int32(Sp, pnode *);
+int par_cmask(Sp, pnode *);
+unsigned reg_csel(Sp, unsigned , int);
+unsigned reg_msel(Sp, unsigned *, int);
+
+// Name lookups; the sp3_*_fmt_to_name variants take a format class and a
+// value.
+const char *spec_sel_to_name(Sp, int sel);
+const char *sp3_fmt_to_name(Sp, int cls, int val);
+const char *sp3_si_fmt_to_name(Sp, int cls, int val);
+const char *sp3_ci_fmt_to_name(Sp, int cls, int val);
+const char *sp3_gfx8_fmt_to_name(Sp, int cls, int val);
+
+void add_reloc_label(Sp, int li, int blame);
+void add_reloc_inst(Sp, int ii, int blame);
+void add_reloc_cf(Sp, int offs);
+
+int grouping_for_group_size(Sp, int group_size);
+
+//JENNICA - this block of name_tree will go away, replace
+//with backend specific.
+
+// Selector passed to get_name_tree() to choose one of the name trees.
+enum nametree_enum {
+    NAMETREE_OPCODES,
+    NAMETREE_OPCODES_0ARG,
+    NAMETREE_OPCODES_CALL,
+    NAMETREE_VTX_FMTS,
+    NAMETREE_SPEC_SELS,
+    NAMETREE_SPEC_VEC_SELS,
+    NAMETREE_SGPR_NAME_SELS,
+    NAMETREE_CONSTS,
+    NAMETREE_DEPRECATED,
+};
+
+// Returns the address of a tree root (struct name_tree **) for `whichtree`.
+struct name_tree **get_name_tree(struct sp3_state *S, enum nametree_enum whichtree);
+
+// Global tree roots. There is no extern for a plain "opcodes" tree here even
+// though NAMETREE_OPCODES exists.
+extern struct name_tree *opcodes_0arg;
+extern struct name_tree *opcodes_call;
+extern struct name_tree *vtx_fmts;
+extern struct name_tree *spec_sels;
+extern struct name_tree *spec_vec_sels;
+extern struct name_tree *sgpr_name_sels;
+extern struct name_tree *consts;
+extern struct name_tree *deprecated;
+
+extern struct name_tree *asic_names;
+// Note: `asic_caps` is used both as a struct tag (name/id pair, table
+// asiccaps[]) and as the name of a name_tree pointer variable.
+struct asic_caps{const char *name; int id;};
+extern struct asic_caps asiccaps[];
+extern struct name_tree *asic_caps; //JENNICA - this may need to go away.
+
+void update_sgpr_names(Sp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/runtime/hsa-runtime/utils/sp3/sp3-int.h b/runtime/hsa-runtime/utils/sp3/sp3-int.h
new file mode 100644
index 0000000000..a66550b1b7
--- /dev/null
+++ b/runtime/hsa-runtime/utils/sp3/sp3-int.h
@@ -0,0 +1,553 @@
+//=====================================================================
+// Copyright 2016 (c), Advanced Micro Devices, Inc. All rights reserved.
+//
+/// \author AMD Developer Tools Team
+/// \file
+///
+//=====================================================================
+
+#ifndef SP3_INT_H
+#define SP3_INT_H
+
+#include "sp3.h"
+
+
+// MSVC: map the POSIX string helpers onto their underscore-prefixed
+// equivalents (unless already defined) and silence a fixed list of warnings.
+#ifdef _MSC_VER
+#ifndef strdup
+#define strdup _strdup
+#endif
+#ifndef stricmp
+#define stricmp _stricmp
+#endif
+#ifndef strcasecmp
+#define strcasecmp _stricmp
+#endif
+#pragma warning(disable:4090 4204 4245 4296 4389 4701 4702)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sp3_state;
+// Parameter-declaration shorthand: writing `Sp` in a parameter list declares
+// `struct sp3_state *S`. Macros such as `A` in sp3-asic.h depend on that
+// parameter being named S.
+#define Sp struct sp3_state *S
+
+// clause types
+
+#define CT_NONE 0
+#define CT_SHADER 1
+
+// parse tree
+
+// Node type codes. The value 26 is not assigned.
+#define P_NUM 0 // integer
+#define P_FLT 1 // float
+#define P_STR 2 // string
+#define P_REG 3 // register component(s)
+#define P_RANGE 4 // closed range
+#define P_RANGEL 5 // right-open range
+#define P_SLICE 6 // array concatenation (used for slices)
+#define P_RCAST 7 // integer -> register cast
+#define P_LIST 8 // list (internal to the parser only)
+#define P_VAR 9 // variable (with name)
+#define P_VARE 10 // variable-element (result of lvalue slice)
+#define P_CL 11 // clause
+#define P_CLI 12 // clause instructions
+#define P_WHILE 13 // while loop
+#define P_REPEAT 14 // repeat-until loop
+#define P_IF 15 // if or if-else
+#define P_CFOR 16 // C-style for loop
+#define P_FOR 17 // vector for loop
+#define P_RET 18 // return from function
+#define P_CSLICE 19 // componentwise slice
+#define P_UREF 20 // unresolved reference
+#define P_FREF 21 // resolved reference
+#define P_CALL 22 // function call
+#define P_PRINT 23 // print to stdout
+#define P_PAR 24 // function parameters
+#define P_NF 25 // native function
+#define P_OMOD 27 // opcode modifier
+#define P_OMODS 28 // opcode modifiers
+#define P_OPARS 29 // opcode parameters
+#define P_OP 30 // opcode
+#define P_SWIZ0 31 // register swizzles with N components wrapped
+#define P_SWIZ1 32 // -"-
+#define P_SWIZ2 33 // -"-
+#define P_SWIZ3 34 // -"-
+#define P_SWIZ4 35 // -"-
+#define P_VTXFMT 36 // vertex formats
+#define P_LABEL 37 // unique identifier of a label
+#define P_LINIT 38 // generate label identifiers
+#define P_MARK 39 // mark a label
+#define P_OPCALL 40 // opcode that does a clause instantiation on par0
+#define P_ASIC 41 // ASIC model
+#define P_ASICCAP 42 // ASIC capability
+#define P_NCLOS 43 // create closure
+#define P_CLOS 44 // closure
+#define P_SH 45 // compiled shader
+
+// Codes from 0x100 upward, numbered consecutively up to P_BITS.
+// NOTE(review): judging by the names these are expression operators --
+// confirm against the evaluator.
+#define P_NOT 0x100
+#define P_BNOT 0x101
+#define P_NEG 0x102
+#define P_MUL 0x103
+#define P_DIV 0x104
+#define P_MOD 0x105
+#define P_ADD 0x106
+#define P_SUB 0x107
+#define P_SHL 0x108
+#define P_SHR 0x109
+#define P_SAR 0x10A
+#define P_LT 0x10B
+#define P_GT 0x10C
+#define P_LEQ 0x10D
+#define P_GEQ 0x10E
+#define P_EQ 0x10F
+#define P_NEQ 0x110
+#define P_BAND 0x111
+#define P_BOR 0x112
+#define P_BXOR 0x113
+#define P_AND 0x114
+#define P_OR 0x115
+#define P_XOR 0x116
+#define P_SEL 0x117
+#define P_XDEC 0x118
+#define P_XINC 0x119
+#define P_DECX 0x11A
+#define P_INCX 0x11B
+#define P_ASGN 0x11C
+#define P_IND 0x11D
+#define P_NOP 0x11E
+#define P_VSUM 0x11F
+#define P_VPROD 0x120
+#define P_VBOR 0x121
+#define P_VBAND 0x122
+#define P_VBXOR 0x123
+#define P_VOR 0x124
+#define P_VAND 0x125
+#define P_VXOR 0x126
+#define P_VMIN 0x127
+#define P_VMAX 0x128
+#define P_CADD 0x129
+#define P_CSUB 0x12A
+#define P_CMUL 0x12B
+#define P_CDIV 0x12C
+#define P_CSHL 0x12D
+#define P_CSHR 0x12E
+#define P_CSAR 0x12F
+#define P_CBAND 0x130
+#define P_CBOR 0x131
+#define P_CBXOR 0x132
+#define P_CAND 0x133
+#define P_COR 0x134
+#define P_CXOR 0x135
+#define P_CMIN 0x136
+#define P_CMAX 0x137
+#define P_MIN 0x138
+#define P_MAX 0x139
+#define P_PROBE 0x13A
+#define P_BITS 0x13B
+
+// register types
+// R_TMASK (0x1E000) covers the bits in which the type values below differ.
+#define R_VGPR 0x00000
+#define R_OFF 0x04000
+#define R_SNAME 0x06000
+#define R_INTERP 0x08000
+#define R_SPEC 0x0A000
+#define R_SGPR 0x0C000
+#define R_EXPBUF 0x0E000
+#define R_TMASK 0x1E000
+
+// magic values for R_SPEC
+#define R_P_CL 3 // used internally only (inline
literal)
+#define R_P_CI_L 0xDB // used internally only
+#define R_P_LDX_L 0xDB // any LDS inline
+#define R_P_LDS_L 0xDF // direct LDS inline
+#define R_P_LDS_H 0xE0
+#define R_P_LDX_H 0xE0
+#define R_P_CI_S 0xF3 // end of new R8xx constants
+#define R_P_CI_H 0xFC
+#define R_P_NOTLAST 0xFF// notlast operand for export
+
+// magic values for R_SNAME
+#define R_S_SCRATCH 1
+#define R_S_PSVS_STATE 2
+#define R_S_SO_WRITE_INDEX 3
+#define R_S_SO_BASE_OFFSET0 4
+#define R_S_SO_BASE_OFFSET1 5
+#define R_S_SO_BASE_OFFSET2 6
+#define R_S_SO_BASE_OFFSET3 7
+#define R_S_OFFCHIP_LDS 8
+#define R_S_IS_OFFCHIP 9
+#define R_S_RING_OFFSET 10
+#define R_S_GS_WAVE_ID 11
+#define R_S_TG_SIZE 12
+#define R_S_TF_BASE 13
+#define R_S_TGID_X 14
+#define R_S_TGID_Y 15
+#define R_S_TGID_Z 16
+#define R_S_WAVE_CNT 17
+#define R_S_GLOBAL_WAVE_ID 18
+
+// register components
+// The component field is bits 10..12: R_CMASK is 0x1C00 and R_CSHIFT is 10.
+#define R_CMASK 0x1C00
+#define R_CSHIFT 10
+#define R_CX 0x0000
+#define R_CY 0x0400
+#define R_CZ 0x0800
+#define R_CW 0x0C00
+#define R_CS 0x1000 // used to identify scalar elements
+#define R_CN 0x1800
+
+// Low ten bits of a register value.
+#define R_IMASK 0x03FF
+
+// source transforms
+#define R_NEG 0x80000
+#define R_ABS 0x100000
+#define R_SEXT 0x200000
+
+// subencodings for export targets
+
+#define R_E_TMASK 0x0380
+#define R_E_MRT 0x0000
+#define R_E_Z 0x0080
+#define R_E_POS 0x0100
+#define R_E_PARAM 0x0180
+#define R_E_ATTR 0x0280
+#define R_E_NULL 0x0300
+
+#define R_E_IMASK 0x007F
+
+// subencodings for interp
+
+// Same mask value as R_E_TMASK.
+#define R_I_TMASK 0x0380
+#define R_I_P10 0x0000
+#define R_I_P20 0x0080
+#define R_I_P0 0x0100
+
+// function parameters
+// F_CMASK selects the F_C* class value; F_OPT and F_VEC are separate flag
+// bits. F_VEC does not fit in a 32-bit int, so the constant is unsigned.
+#define F_CANY 0x00000000
+#define F_CNUM 0x01000000
+#define F_CREG 0x02000000
+#define F_CTMP 0x03000000
+#define F_CFPTR 0x04000000
+#define F_CINT 0x05000000
+#define F_CMASK 0x07000000
+#define F_OPT 0x40000000
+#define F_VEC 0x80000000
+
+// Parse-tree node. `type` is a node type code and `ni` the number of
+// entries in `i`.
+// NOTE(review): `i` is declared with a single element; presumably nodes are
+// allocated with room for `ni` items -- confirm in the allocator.
+typedef struct pnode {
+    struct pnode *gc_next;
+    int gc_mark;
+    int type;
+    int et; // error reporting tag
+    int ni; // number of items
+    union pnode_item {
+        int num; // integer
+        float flt; // float
+        char *str; // string
+        struct pnode *ptr; // tree item
+        struct {
+            struct pnode *v;
+            int e;
+        } ve; // variable-element pair
+        struct {
+            int p;
+            char *n;
+        } var; // variable (stack offset, name)
+        struct sp3_shader *sh;
+        unsigned int reg; // register components
+        struct pnode *(* nf)(Sp, struct pnode **); // native function
+    } i[1];
+} pnode;
+
+// Node constructors and converters.
+pnode *p_str(Sp, char *s); // wrap a string
+pnode *p_float(Sp, float f); // wrap a float
+pnode *p_num(Sp, int i); // wrap an integer
+pnode *p_vec(Sp, int type, int len); // create a vector
+pnode *p_list(Sp, pnode *list, pnode *item); // append item to P_LIST
+pnode *p_list_rev(Sp, pnode *list); // reverse the order of the list
+pnode *p_tree(Sp, int type, int nitems, ...); // create a tree node
+pnode *p_l2t(Sp, int type, pnode *list); // list to tree
+pnode *p_l2v(Sp, int type, pnode *list); // list to vector
+pnode *p_x2x(Sp, int type, pnode *p); // cast to type
+pnode *p_clause(Sp, int vstk, int lstk, pnode *parlist, pnode *instlist, int type);
+pnode *p_reg(Sp, int type, int idx); // wrap a register
+pnode *p_swizzle(Sp, char *str); // parse a swizzle string
+pnode *p_lv2rv(Sp, pnode *lval); // lvalue to rvalue
+pnode *p_newlabel(Sp, pnode *t, int tag); // define new label
+pnode *p_label(Sp, int cnt); // fill with label IDs
+pnode *p_clone(Sp, pnode *src);
+
+void print_node(pnode *); // print to stdout
+
+void mark_gc_storage(Sp); // mark all internal storage of sp3 for gc
+
+// functions provided by machine driver
+int is_opcode(struct sp3_state *S, const char *name); // is an opcode (any)
+int is_opcode_0arg(struct sp3_state *S, const char *name); // is an opcode (0-argument)
+int is_opcode_call(struct sp3_state *S, const char *name); // is a call op (1st argument is a closure)
+void sp3_gen_opcode(Sp, const char *op, pnode *par, pnode *mod);
+void sp3_si_gen_opcode(Sp, const char *op, pnode *par, pnode *mod);
+void sp3_ci_gen_opcode(Sp, const char *op, pnode *par, pnode *mod);
+void sp3_gfx8_gen_opcode(Sp, const char *op, pnode *par, pnode *mod);
+pnode *machine_const(Sp, char *name); // if a machine const, parse it (else NULL)
+void mark_label(Sp, int li); // "label:"
+pnode *asic_getcap(Sp, int id); // get ASIC capability #id
+void mach_cleanup(Sp); // initialize generator state
+
+// name trees
+
+// Values for the `add` argument of name_tree_operation().
+#define NT_SEARCH 0
+#define NT_ADD 1
+#define NT_ADD_ONLY 2
+#define NT_ADD_STRDUP 4
+// Tree node with left/right children, a name and an integer tag.
+struct name_tree {
+    const char *name;
+    int tag;
+    int add;
+    struct name_tree *l, *r;
+};
+
+struct name_tree *name_tree_operation(struct name_tree **t, const char *name, int tag, int add);
+void name_tree_delete(struct name_tree **t);
+
+// symbol table
+
+void f_decl(Sp, char *, pnode *);
+pnode *f_ref(Sp, char *);
+void f_check(Sp);
+pnode *f_call(Sp, const char *);
+
+// Registers a native function; the callback has the same signature as
+// pnode_item::nf.
+void f_decl_native(Sp, int, char *, pnode *(*)(Sp, pnode **), int, ...);
+
+// parse-time variable stack
+
+void vs_decl(Sp, const char *, int tag);
+int vs_lookup(Sp, const char *, pnode **, int);
+char *vs_getname(pnode *);
+
+void vs_enter_func(Sp);
+int vs_leave_func(Sp, int *); // returns number of stack allocations &
+                              // (through param) number of lstack allocs
+void vs_enter_block(Sp);
+void vs_leave_block(Sp);
+
+int vs_get_topmax(Sp); // returns number of stack allocation for top level
+
+// runtime variable stack
+
+void rv_set(Sp, pnode *, pnode *);
+pnode *rv_get(Sp, pnode *);
+void rv_alloc(Sp, int);
+void rv_setpar(Sp, int, pnode *);
+int rv_enter(Sp, int);
+void rv_leave(Sp, int);
+
+int rl_enter(Sp, int);
+void rl_leave(Sp, int);
+
+void rv_leave_native(Sp);
+pnode **rv_getpar_native(Sp);
+
+// all-in-one variable setter
+
+void rv_set_by_name(Sp, const char *, pnode *);
+
+// growable binary buffer
+
+// NOTE(review): `i` is declared with one element, like pnode::i; presumably
+// the buffer is allocated larger and n/size track used/allocated words --
+// confirm in gb_alloc().
+// The gb_* functions below return a grow_buf pointer.
+typedef struct grow_buf {
+    int n, size;
+    unsigned i[1];
+} grow_buf;
+
+grow_buf *gb_alloc(int);
+grow_buf *gb_append(grow_buf *, int, unsigned *);
+grow_buf *gb_add(grow_buf *, unsigned);
+grow_buf *gb_reg(grow_buf *, unsigned, unsigned);
+
+// clause
contents
+
+// One generated clause: a base value, its data buffer and a type.
+struct clause_info {
+    unsigned base;
+    grow_buf *data;
+    int type;
+};
+
+void start_clause(Sp, int);
+void cb_emit(Sp, unsigned *, int);
+int cb_ptr(Sp);
+void cb_patch(Sp, int, int, unsigned);
+
+int remap_clauses(Sp);
+
+struct sp3_shader *gen_output(Sp);
+void convert_relocs(Sp);
+void perform_relocs(Sp);
+
+pnode *shader_clos(Sp, pnode *); // call this to get a binary shader from closure
+pnode *shader_name(Sp, const char *); // call this to get a binary shader from name
+
+void set_const(Sp, int idx, unsigned val);
+int find_const(Sp, unsigned val);
+
+void set_kbuf(Sp, int kbuf, int idx, unsigned val);
+
+const char *asic_name(Sp);
+int asic_id(Sp);
+int asic_capbyname(int, const char *);
+int asic_capbyid(int, int);
+
+// register stream packer
+// The per-backend guess functions take only the shader; the generic one
+// also takes the state.
+int sp3_guess_shader_type(struct sp3_state *S, struct sp3_shader *sh);
+int sp3_si_guess_shader_type(struct sp3_shader *sh);
+int sp3_ci_guess_shader_type(struct sp3_shader *sh);
+int sp3_gfx8_guess_shader_type(struct sp3_shader *sh);
+void sp3_pack_reg_stream(Sp, int type, struct sp3_shader *sh);
+void sp3_si_pack_reg_stream(Sp, int type, struct sp3_shader *sh);
+void sp3_ci_pack_reg_stream(Sp, int type, struct sp3_shader *sh);
+void sp3_gfx8_pack_reg_stream(Sp, int type, struct sp3_shader *sh);
+void unpack_reg_stream(Sp, struct sp3_shader *sh);
+
+// instances
+
+int new_instance(Sp, pnode *, int);
+void eval_instances(Sp);
+int get_instance_clause(Sp, int);
+int get_instance_type(Sp, int);
+
+// error reporting
+
+void et_parse_mode(Sp, int);
+int et_get_id(Sp);
+// et_error is declared noreturn for both MSVC and GCC. For et_error and
+// et_warning the printf-style format string is the third parameter (after S
+// and the first char *), hence format(printf, 3, 4).
+#ifdef _MSC_VER
+__declspec(noreturn)
+#endif
+void et_error(Sp, char *, char *, ...)
+#ifdef __GNUC__
+__attribute__ ((__noreturn__))
+__attribute__ ((format(printf, 3, 4)))
+#endif
+;
+void et_warning(Sp, char *, char *, ...)
+#ifdef __GNUC__
+__attribute__ ((format(printf, 3, 4)))
+#endif
+;
+void et_blame(Sp, pnode *);
+void et_blame_et(Sp, int);
+void et_print(Sp, pnode *);
+int et_get_blame(Sp);
+
+// text buffer for disasm
+// bprintf: the format string is the second parameter, hence
+// format(printf, 2, 3).
+void bprintf(Sp, char *, ...)
+#ifdef __GNUC__
+__attribute__ ((format(printf, 2, 3)))
+#endif
+;
+void bcmt(Sp, const char *cmt, const char *start, const char *line, const char *end);
+void btab(Sp, int);
+char *bget(Sp);
+
+// state structure
+// This is the object every `Sp` parameter points to. The section comments
+// inside name the module each group of fields belongs to.
+// NOTE(review): sp3_shader, sp3_vma, sp3_vmaddr and sp3_comment_cb are not
+// declared in this header; presumably they arrive through "sp3.h" -- confirm.
+struct sp3_state {
+    // flex
+    void *scanner;
+    void *yystate;
+
+    char *yyfile;
+    int yyline;
+
+    // sp3-gc
+    struct sp3_gc_state *gc;
+
+    // asic private
+    struct sp3_asic_state *ap;
+
+    // sp3-eval
+    int retflag;
+    pnode *retval;
+
+    // sp3-int
+    struct sp3_shader config; // embedded by value, not a pointer
+
+    int clause_id; // counts up during evaluation
+    int clause_type;
+    struct clause_info *clauses;
+    int nclauses, sclauses;
+
+    int memsize, ctsizes[4];
+    int in_shader;
+
+    char *disasm_text;
+    int disasm_column;
+    int disasm_len, disasm_maxlen;
+
+    sp3_vma *comment_map;
+    void *comment_ctx;
+    sp3_comment_cb comment_top, comment_right;
+
+    unsigned const_buf[1024];
+    int const_vld[1024], const_vld_range;
+
+    unsigned *kval[16];
+    int knum[16];
+
+    struct et_record {
+        const char *file;
+        int line;
+    } *et_names;
+    int et_node;
+    int et_parsing;
+    int net_names, set_names;
+
+    char *fname_last;
+    struct name_tree *fnames;
+    struct fsym {
+        char *name;
+        pnode *func;
+        struct fref *refs;
+        struct fsym *l, *r;
+    } *fsymbols;
+    int func_id; // counts up during parsing
+
+    struct instance {
+        int type;
+        int clause_id;
+        pnode *call;
+    } *instances;
+    int ninstances, sinstances;
+
+    struct vstack {
+        char *name;
+        int tag;
+        int vs_sp, vs_level;
+        struct vstack *next;
+    } *var_stack, *lbl_stack;
+    int vs_max, vs_sp, vs_top, vs_topmax;
+    int ls_max, ls_sp;
+
+    pnode **rl_stack;
+    int rl_sp, rl_ss, rl_base, rl_id, rl_size;
+
+    pnode **rv_stack;
+    int rv_sp, rv_ss, rv_base, rv_size;
+
+    int werror, wcount;
+    const char *err_hdr;
+
+    unsigned entry_point_table_size;
+    unsigned entry_point_table_alloc_size;
+    sp3_vmaddr *entry_point_table;
+};
+struct sp3_state *sp3_new_state(void);
+void sp3_asic_attach_state(Sp);
+void sp3_new_parser(Sp);
+void sp3_free_parser(Sp);
+void sp3_free_state(Sp);
+
+void reg_natives(Sp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/runtime/hsa-runtime/utils/sp3/sp3-type.h b/runtime/hsa-runtime/utils/sp3/sp3-type.h
new file mode 100644
index 0000000000..160dc945ed
--- /dev/null
+++ b/runtime/hsa-runtime/utils/sp3/sp3-type.h
@@ -0,0 +1,137 @@
+//=====================================================================
+// Copyright 2016 (c), Advanced Micro Devices, Inc. All rights reserved.
+//
+/// \author AMD Developer Tools Team
+/// \file
+///
+//=====================================================================
+
+#ifndef SP3_TYPE_H
+#define SP3_TYPE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @file sp3-type.h
+/// @brief sp3 types
+
+/// @brief Shader types; SP3_SHTYPE_NONE (-1) means no type.
+enum sp3_shtype {
+    SP3_SHTYPE_NONE = -1,
+    SP3_SHTYPE_PS = 0,
+    SP3_SHTYPE_VS = 1,
+    SP3_SHTYPE_GS = 2,
+    SP3_SHTYPE_ES = 3,
+    SP3_SHTYPE_HS = 4,
+    SP3_SHTYPE_LS = 5,
+    SP3_SHTYPE_CS = 6,
+};
+
+/// @brief Array sizes used by sp3_shader (cb_masks, strm_used).
+enum sp3_count {
+    SP3_NUM_MRT = 8,
+    SP3_NUM_STRM = 4,
+};
+
+/// @brief Single-bit flags; the values can be OR-ed together.
+enum sp3_flag {
+    SP3DIS_NO_STATE = 0x01,
+    SP3DIS_NO_BINARY = 0x02,
+    SP3DIS_COMMENTS = 0x04,
+    SP3DIS_NO_GPR_COUNT = 0x08,
+    SP3DIS_FORCEVALID = 0x10,
+    SP3DIS_NO_ASIC = 0x20,
+};
+
+/// @brief Shader context. Contains no user-visible fields.
+struct sp3_context;
+
+/// @brief Storage entry for register streams.
+struct sp3_reg {
+    unsigned index; ///< One of the mm* values from chip_enum.h.
+    unsigned value;
+};
+
+/// @brief Wrapped shader metadata.
+///
+/// After generation, shaders are encapsulated in sp3_shader structures.
+///
+/// Those structures contain the shader binary, its register stream,
+/// constants and constant buffers and metadata needed for SC compatibility.
+struct sp3_shader {
+    int type; ///< One of the SP3_SHTYPE_* constants (enum sp3_shtype).
+    int asic_int; ///< Internal ASIC index. Do not use.
+    const char *asic; ///< ASIC name as a string ("RV870" etc).
+    unsigned size; ///< Size of the compiled shader, in 32-bit words.
+    unsigned nsgprs; ///< Number of scalar GPRs used.
+    unsigned nvgprs; ///< Number of vector GPRs used.
+    unsigned trap_present;
+    unsigned user_sgpr_count;
+    unsigned scratch_en;
+    unsigned dispatch_draw_en;
+    unsigned so_en;
+    unsigned so_base0_en;
+    unsigned so_base1_en;
+    unsigned so_base2_en;
+    unsigned so_base3_en;
+    unsigned oc_lds_en;
+    unsigned tg_size_en;
+    unsigned tidig_comp_cnt; ///< Number of components(-1) enabled for thread id in group
+    unsigned tgid_x_en;
+    unsigned tgid_y_en;
+    unsigned tgid_z_en;
+    unsigned wave_cnt_en;
+    unsigned sgpr_scratch;
+    unsigned sgpr_psvs_state;
+    unsigned sgpr_so_write_index;
+    unsigned sgpr_so_base_offset0;
+    unsigned sgpr_so_base_offset1;
+    unsigned sgpr_so_base_offset2;
+    unsigned sgpr_so_base_offset3;
+    unsigned sgpr_offchip_lds;
+    unsigned sgpr_is_offchip;
+    unsigned sgpr_ring_offset;
+    unsigned sgpr_gs_wave_id;
+    unsigned sgpr_global_wave_id;
+    unsigned sgpr_tg_size;
+    unsigned sgpr_tgid_x;
+    unsigned sgpr_tgid_y;
+    unsigned sgpr_tgid_z;
+    unsigned sgpr_tf_base;
+    unsigned sgpr_wave_cnt;
+    unsigned pc_exports; ///< Range of parameters exported (if VS).
+    unsigned pos_export; ///< Shader executes a position export (if VS).
+    unsigned cb_exports; ///< Range of MRTs exported (if PS).
+    unsigned mrtz_export_format; ///< Export format of the mrtz export.
+    unsigned z_export; ///< Shader executes a Z export (if PS).
+    unsigned pops_en; ///< Shader is POPS (PS)
+    unsigned load_collision_waveid; ///< Shader sets load collision waveid (if PS).
+    unsigned stencil_test_export; ///< Shader exports stencil (if PS).
+    unsigned stencil_op_export; ///< Shader exports stencil (if PS).
+    unsigned kill_used; ///< Shader executes ALU KILL operations.
+    unsigned cb_masks[SP3_NUM_MRT]; ///< Component masks for each MRT exported (if PS).
+    unsigned emit_used; ///< EMIT opcodes used (if GS).
+    unsigned covmask_export; ///< Shader exports coverage mask (if PS).
+    unsigned mask_export; ///< Shader exports mask (if PS).
+    unsigned strm_used[SP3_NUM_STRM]; ///< Streamout operations used (map).
+    unsigned scratch_used; ///< Scratch SMX exports used.
+    unsigned scratch_itemsize; ///< Scratch ring item size.
+    unsigned reduction_used; ///< Reduction SMX exports used.
+    unsigned ring_used; ///< ESGS/GSVS ring SMX exports used.
+    unsigned ring_itemsize; ///< ESGS/GSVS ring item size (for ES/GS respectively).
+    unsigned vertex_size[4]; ///< GSVS ring vertex size (for GS).
+    unsigned mem_used; ///< Raw memory SMX exports used.
+    unsigned rats_used; ///< Mask of RATs (UAVs) used
+    unsigned group_size[3]; ///< Wavefront group size (for ELF files).
+    unsigned alloc_lds; ///< Number of LDS bytes allocated for wave group. (translates to lds_size in CS and LS)
+    unsigned *data; ///< Shader binary data.
+    unsigned nregs; ///< Number of register writes in the stream.
+    struct sp3_reg *regs; ///< Register writes (index-value pairs).
+};
+
+/// @brief Comment callback.
+typedef const char *(*sp3_comment_cb)(void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/runtime/hsa-runtime/utils/sp3/sp3-vm.h b/runtime/hsa-runtime/utils/sp3/sp3-vm.h
new file mode 100644
index 0000000000..15c1baeb3c
--- /dev/null
+++ b/runtime/hsa-runtime/utils/sp3/sp3-vm.h
@@ -0,0 +1,119 @@
+//=====================================================================
+// Copyright 2016 (c), Advanced Micro Devices, Inc. All rights reserved.
+// +/// \author AMD Developer Tools Team +/// \file +/// +//===================================================================== + +#ifndef SP3_VM_H +#define SP3_VM_H + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined (WIN_OS) && !defined(SP3_STATIC_LIB) + #if defined(DLL_EXPORT_SP3) + #define SP3_EXPORT __declspec(dllexport) + #else + #define SP3_EXPORT __declspec(dllimport) + #endif +#else + #define SP3_EXPORT +#endif + +#ifdef _MSC_VER +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; + +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include <stdint.h> +#endif + +struct sp3_vma; + +/// @file sp3-vm.h +/// @brief sp3 VM API +/// +/// The VM API is used to manage virtual memory maps. Those maps are +/// used for binary storage for disassembly, as they can naturally +/// mirror the GPU's memory map (so no register translation is needed). + +#define SP3_VM_PAGESIZE 64 + +/// @brief VM addresses are 64-bit and the address unit is 32 bits +/// +typedef uint64_t sp3_vmaddr; + +/// @brief Callback function that will fill a VMA on demand +/// +/// The VMA to be filled will be specified through the request address. +/// The callback should fill the VMA using sp3_vm_write calls. +typedef void (* sp3_vmfill)(struct sp3_vma *vm, sp3_vmaddr addr, void *ctx); + +/// @brief VM area +/// +/// VMAs are kept in a sorted list +typedef struct sp3_vma { + sp3_vmaddr base, len; + sp3_vmfill fill; + void *fill_ctx; + uint32_t *data; + struct sp3_vma *prev, *next; +} sp3_vma; + +/// @brief Create a new VM that is empty. +/// +SP3_EXPORT +sp3_vma *sp3_vm_new(void); + +/// @brief Create a new VM that has a sp3_vmfill callback. +/// +SP3_EXPORT +sp3_vma *sp3_vm_new_fill(sp3_vmfill fill, void *ctx); + +/// @brief Create a new VM from an array of words. +/// @param base VM address to load array at. +/// @param len Number of 32-bit words in the array. +/// @param data Pointer to the array. 
+/// +SP3_EXPORT +sp3_vma *sp3_vm_new_ptr(sp3_vmaddr base, sp3_vmaddr len, const uint32_t *data); + +/// @brief Find a VMA, optionally adding it. +/// @param vm VM to search in. +/// @param addr Address to search for. +/// @param add Flag indicating whether a failure should result in adding a new VMA. +/// +SP3_EXPORT +sp3_vma *sp3_vm_find(sp3_vma *vm, sp3_vmaddr addr, int add); + +/// @brief Write a word to a VM. +/// +SP3_EXPORT +void sp3_vm_write(sp3_vma *vm, sp3_vmaddr addr, uint32_t val); + +/// @brief Read a word from a VM. +/// +SP3_EXPORT +uint32_t sp3_vm_read(sp3_vma *vm, sp3_vmaddr addr); + +/// @brief Probe VM for presence. +/// @return 1 if the specified address is backed in the VM, 0 otherwise. +/// +SP3_EXPORT +int sp3_vm_present(sp3_vma *vm, sp3_vmaddr addr); + +/// @brief Free a VM and all its storage. +/// +SP3_EXPORT +void sp3_vm_free(sp3_vma *vm); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/runtime/hsa-runtime/utils/sp3/sp3.h b/runtime/hsa-runtime/utils/sp3/sp3.h new file mode 100644 index 0000000000..7ecc8e67a4 --- /dev/null +++ b/runtime/hsa-runtime/utils/sp3/sp3.h @@ -0,0 +1,198 @@ +//===================================================================== +// Copyright 2016 (c), Advanced Micro Devices, Inc. All rights reserved. +// +/// \author AMD Developer Tools Team +/// \file +/// +//===================================================================== + +#ifndef SP3_H +#define SP3_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "sp3-vm.h" +#include "sp3-type.h" + +/// @file sp3.h +/// @brief sp3 API + +/// @brief Get version of the sp3 library. +/// +/// @return String containing the version number. +/// +SP3_EXPORT const char *sp3_version(void); + +/// @brief Create a new sp3 context. +/// +SP3_EXPORT struct sp3_context *sp3_new(void); + +/// @brief Set option for sp3. +/// +/// @param state sp3 context. +/// @param option Option name. Unknown options will raise an error. +/// @param value Option value. 
NULL is used to represent value-less options. +/// +SP3_EXPORT void sp3_set_option(struct sp3_context *state, const char *option, const char *value); + +/// @brief Parse a file into a context. +/// +/// If 'file' is NULL, parse stdin. +/// +SP3_EXPORT void sp3_parse_file(struct sp3_context *state, const char *file); + +/// @brief Parse a string into a context. +/// +SP3_EXPORT void sp3_parse_string(struct sp3_context *state, const char *string); + +/// @brief Parse a file from the standard library into a context. +/// +SP3_EXPORT void sp3_parse_library(struct sp3_context *state, const char *name); + +/// @brief Call a sp3 function. +/// +SP3_EXPORT void sp3_call(struct sp3_context *state, const char *func); + +/// @brief Call a sp3 CF clause. +/// +/// @param state sp3 context. +/// @param cffunc Name of clause to call. By convention, this is "main". +/// +/// @return A compiled and linked shader. Free memory with sp3_free(). +/// +SP3_EXPORT struct sp3_shader *sp3_compile(struct sp3_context *state, const char *cffunc); + +/// @brief Free a sp3_shader. +/// +SP3_EXPORT void sp3_free_shader(struct sp3_shader *sh); + +/// @brief Get current ASIC name set for a context. +/// +SP3_EXPORT const char *sp3_getasic(struct sp3_context *state); + +/// @brief Set current ASIC name for a context. +/// +SP3_EXPORT void sp3_setasic(struct sp3_context *state, const char *chip); + +/// @brief Set global variable in context to an integer. +/// +SP3_EXPORT void sp3_set_param_int(struct sp3_context *state, const char *name, int value); + +/// @brief Set global variable in context to an integer vector. +/// +SP3_EXPORT void sp3_set_param_intvec(struct sp3_context *state, const char *name, int size, const int *value); + +/// @brief Set global variable in context to a float. +/// +SP3_EXPORT void sp3_set_param_float(struct sp3_context *state, const char *name, float value); + +/// @brief Set global variable in context to a float vector. 
+/// +SP3_EXPORT void sp3_set_param_floatvec(struct sp3_context *state, const char *name, int size, const float *value); + +/// @brief Set error message header. +/// +SP3_EXPORT void sp3_set_error_header(struct sp3_context *state, const char *str); + +/// @brief Get ASIC metrics for the ASIC in current state. +/// +/// Used by ELF tools to fill in some CAL fields. +/// +SP3_EXPORT int sp3_asicinfo(struct sp3_context *state, const char *name); + +/// @brief Free a context allocated by sp3_new/open/parse. +/// +SP3_EXPORT void sp3_close(struct sp3_context *state); + +/// @brief Disassemble a shader. +/// +/// This call is likely to change to something that will take a filled sp3_shader structure later on. +/// +/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC). +/// @param bin Memory map with the opcodes (see sp3-vm.h). +/// @param base Start of the shader in the memory map (in VM entries, i.e. 32-bit words). +/// @param name Name to give the disassembled shader. +/// @param shader_type One of the SHTYPE_* constants. +/// @param include Literal text to include in the CF clause (NULL includes nothing). +/// @param max_len Maximum length of CF clause. Matters if SP3DIS_FORCEVALID is set. +/// @param flags A mask of SP3DIS_* flags. +/// +/// @return Shader disassembly as a string (allocated with malloc()). Free memory with sp3_free(). +/// +SP3_EXPORT char *sp3_disasm(struct sp3_context *state, sp3_vma *bin, sp3_vmaddr base, const char *name, int shader_type, const char *include, unsigned max_len, unsigned flags); + +/// @brief Disassemble a single shader instruction. +/// +/// This call is likely to change to something that will take a filled sp3_shader structure later on. +/// +/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC). +/// @param inst Pointer to dwords containing instruction (exact number of dwords required depends on instruction). 
+/// @param base Start of the shader in the memory map (in VM entries, i.e. 32-bit words). +/// @param addr Address of the instruction being disassembled (in VM entries, i.e. 32-bit words). +/// @param shader_type One of the SHTYPE_* constants. +/// @param flags A mask of SP3DIS_* flags. +/// +/// @return Shader disassembly as a string (allocated with malloc()). Free memory with sp3_free(). +/// +SP3_EXPORT char *sp3_disasm_inst(struct sp3_context *state, const unsigned inst[2], sp3_vmaddr base, sp3_vmaddr addr, int shader_type, unsigned flags); + +/// @brief Parse a register stream. +/// +/// Can be called before sp3_disasm to preset things like ALU, boolean and loop constants. +/// +/// This call is likely to merge with sp3_disasm later on. +/// +/// @param state sp3 context to fill with state. +/// @param nregs Number of register entries. +/// @param regs Register stream to parse. +/// @param shader_type One of the SHTYPE_* constants. +/// +SP3_EXPORT void sp3_setregs(struct sp3_context *state, unsigned nregs, const struct sp3_reg *regs, int shader_type); + + +/// @brief Set shader comments +/// +/// @param state sp3 context. +/// @param map Map of comments (0 for no comment, other values will be passed to the callback). +/// @param f_top Callback returning comment to place above the opcode. +/// @param f_right Callback returning comment to place to the right of the opcode. +/// @param ctx Void pointer to pass to comment callbacks. +/// +SP3_EXPORT void sp3_setcomments(struct sp3_context *state, sp3_vma *map, sp3_comment_cb f_top, sp3_comment_cb f_right, void *ctx); + +/// @brief Set alternate shader entry points +/// +/// Used for disassembly; this marks an additional location in memory +/// (besides the start address) where shader code may be found. Generally +/// required for jump tables and any case where the shader may perform +/// indirect jumps to ensure that disassembly locates all shader +/// instructions. 
+/// +/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC). +/// @param addr Address of the additional entry point (in VM entries, i.e. 32-bit words). +/// +SP3_EXPORT void sp3_setentrypoint(struct sp3_context *state, sp3_vmaddr addr); + +/// @brief Clear alternate shader entry points +/// +/// Clear all entry points previously set with sp3_setentrypoint. +/// +/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC). +/// +SP3_EXPORT void sp3_clearentrypoints(struct sp3_context *state); + +/// @brief Free memory allocated by sp3. +/// +/// Windows DLLs that allocate memory have to free it. This function +/// should be used to free the result of sp3_disasm, sp3_compile etc. +/// +SP3_EXPORT void sp3_free(void *ptr); + +#ifdef __cplusplus +} +#endif + + +#endif