From 90ab72cd66136f22fc2edab5f2d613c50bb5a245 Mon Sep 17 00:00:00 2001 From: Jay Cornwall Date: Wed, 27 Apr 2016 16:48:32 -0500 Subject: [PATCH] Implement optimized blit/fill kernels Replace HSAIL kernels with SP3 shaders. Support all alignment variations efficiently. Change-Id: Icf7f5471f3ba68389f55484d82f2805dd9bc3827 --- .../hsa-runtime/core/inc/amd_blit_kernel.h | 69 +- .../hsa-runtime/core/inc/amd_blit_kernel_kv.h | 479 ---------- .../hsa-runtime/core/inc/amd_blit_kernel_vi.h | 490 ---------- runtime/hsa-runtime/core/inc/amd_blit_sdma.h | 4 +- runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 22 +- runtime/hsa-runtime/core/inc/blit.h | 4 +- .../core/runtime/amd_blit_kernel.cpp | 835 ++++++++++++------ .../core/runtime/amd_blit_sdma.cpp | 8 +- .../core/runtime/amd_gpu_agent.cpp | 73 +- 9 files changed, 718 insertions(+), 1266 deletions(-) delete mode 100644 runtime/hsa-runtime/core/inc/amd_blit_kernel_kv.h delete mode 100644 runtime/hsa-runtime/core/inc/amd_blit_kernel_vi.h diff --git a/runtime/hsa-runtime/core/inc/amd_blit_kernel.h b/runtime/hsa-runtime/core/inc/amd_blit_kernel.h index a7b0a58f83..73b9ef81cf 100644 --- a/runtime/hsa-runtime/core/inc/amd_blit_kernel.h +++ b/runtime/hsa-runtime/core/inc/amd_blit_kernel.h @@ -43,6 +43,7 @@ #ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_ #define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_ +#include #include #include "core/inc/blit.h" @@ -66,8 +67,10 @@ class BlitKernel : public core::Blit { /// /// @note: The call will block until all AQL packets have been executed. /// + /// @param agent Agent passed to Initialize. + /// /// @return hsa_status_t - virtual hsa_status_t Destroy() override; + virtual hsa_status_t Destroy(const core::Agent& agent) override; /// @brief Submit an AQL packet to perform vector copy. The call is blocking /// until the command execution is finished. @@ -104,17 +107,36 @@ class BlitKernel : public core::Blit { private: union KernelArgs { - struct __ALIGNED__(16) KernelCopyArgs { - const void* src; - void* dst; - uint64_t size; - uint32_t use_vector; - } copy; + struct __ALIGNED__(16) { + uint64_t phase1_src_start; + uint64_t phase1_dst_start; + uint64_t phase2_src_start; + uint64_t phase2_dst_start; + uint64_t phase3_src_start; + uint64_t phase3_dst_start; + uint64_t phase4_src_start; + uint64_t phase4_dst_start; + uint64_t phase4_src_end; + uint64_t phase4_dst_end; + uint32_t num_workitems; + } copy_aligned; - struct __ALIGNED__(16) KernelFillArgs { - void* ptr; - uint64_t num; - uint32_t value; + struct __ALIGNED__(16) { + uint64_t phase1_src_start; + uint64_t phase1_dst_start; + uint64_t phase2_src_start; + uint64_t phase2_dst_start; + uint64_t phase2_src_end; + uint64_t phase2_dst_end; + uint32_t num_workitems; + } copy_misaligned; + + struct __ALIGNED__(16) { + uint64_t phase1_dst_start; + uint64_t phase2_dst_start; + uint64_t phase2_dst_end; + uint32_t fill_value; + uint32_t num_workitems; } fill; }; @@ -136,14 +158,19 @@ class BlitKernel : public core::Blit { KernelArgs* ObtainAsyncKernelCopyArg(); - /// Handles to the vector copy kernel. - uint64_t copy_code_handle_; + /// AQL code object and size for each kernel. + enum class KernelType { + CopyAligned, + CopyMisaligned, + Fill, + }; - /// Handles to the vector copy aligned kernel. - uint64_t copy_aligned_code_handle_; + struct KernelCode { + void* code_buf_; + size_t code_buf_size_; + }; - /// Handles to the fill memory kernel. - uint64_t fill_code_handle_; + std::map kernels_; /// AQL queue for submitting the vector copy kernel. hsa_queue_t* queue_; @@ -163,12 +190,8 @@ class BlitKernel : public core::Blit { /// Lock to synchronize access to kernarg_ and completion_signal_ std::mutex lock_; - /// Pointer to memory containing the ISA and argument buffer. - void* code_arg_buffer_; - - static const size_t kMaxCopyCount; - static const size_t kMaxFillCount; - static const uint32_t kGroupSize; + /// Number of CUs on the underlying agent. + int num_cus_; }; } // namespace amd diff --git a/runtime/hsa-runtime/core/inc/amd_blit_kernel_kv.h b/runtime/hsa-runtime/core/inc/amd_blit_kernel_kv.h deleted file mode 100644 index a8e235ea7b..0000000000 --- a/runtime/hsa-runtime/core/inc/amd_blit_kernel_kv.h +++ /dev/null @@ -1,479 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_ -#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_ - -#include - -#define HSA_VECTOR_COPY_KV_AKC_SIZE 368 -#define HSA_VECTOR_COPY_KV_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kVectorCopyRawKv. -module &m:1:0:$full:$large:$default; - -prog kernel &__vector_copy_kernel( - kernarg_u64 %src, - kernarg_u64 %dst, - kernarg_u64 %size) -{ - @__vector_copy_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @BB0_2; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; - ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_global_u8 $s0, [$d0]; - st_global_u8 $s0, [$d2]; - - @BB0_2: - // %return - ret; -}; -*/ - -static char kVectorCopyRawKv[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, -104, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, - 112, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 65, 0, -116, 0, -112, 0, 0, 0, - 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 11, 0, 5, 0, 5, 0, 0, 0, 9, 0, 0, - 0, 0, 0, 0, 0, 3, 0, 0, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, -116, -65, - 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, 0, - 0, 0, 74, 4, 7, 64, -64, -128, 2, 2, 126, 127, 0, - -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 15, 0, -120, - -65, 0, 7, -126, -64, 127, 0, -116, -65, 4, 0, 2, 74, - 5, 2, 4, 126, 2, 106, 80, -46, 2, 1, -87, 1, 0, - 0, 32, -36, 1, 0, 0, 1, 6, 0, 6, 74, 7, 2, - 4, 126, 4, 106, 80, -46, 2, 1, -87, 1, 112, 0, -116, - -65, 0, 0, 96, -36, 3, 1, 0, 0, 0, 0, -127, -65, - 3, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, - 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, - 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, - 3, 0, 0, 0, 28, 0, 0, 0, 3, 0, 0, 0, 65, - 77, 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, 68, - 71, 80, 85, 0, 0, 3, 0, 0, 0, 40, 0, 0, 0, - 4, 0, 0, 0, 65, 77, 68, 0, 26, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 65, 77, 68, 32, 72, 83, - 65, 32, 82, 117, 110, 116, 105, 109, 101, 32, 70, 105, 110, - 97, 108, 105, 122, 101, 114, 0, 0, 0, 38, 95, 95, 118, - 101, 99, 116, 111, 114, 95, 99, 111, 112, 121, 95, 107, 101, - 114, 110, 101, 108, 0, 95, 95, 104, 115, 97, 95, 115, 101, - 99, 116, 105, 111, 110, 46, 104, 115, 97, 116, 101, 120, 116, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 22, 0, 0, 0, 3, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, 0, 46, 110, - 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, 98, 0, 46, - 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, 115, 116, 114, - 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 7, 0, -64, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 7, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 112, 2, 0, 0, 0, 0, 0, - 0, -104, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0, - 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 8, 3, 0, 0, 0, 0, 0, 0, - 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 2, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 56, 3, 0, 0, 0, 0, 0, 0, 48, - 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, - 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, - 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, 0, 0, 0, - 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 104, 3, 0, 0, 0, 0, 0, 0, 42, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, -}; -extern char* const kVectorCopyKvObject = &kVectorCopyRawKv[0]; -extern size_t const kVectorCopyKvObjectSize = sizeof(kVectorCopyRawKv); - -#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE 436 -#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawKv. -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; - -prog kernel &__copy_buffer_aligned_kernel( - kernarg_u64 %src, - kernarg_u64 %dst, - kernarg_u64 %size, - kernarg_u32 %use_vector) -{ - @__copy_buffer_aligned_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @LBB0_4; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; - ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; - ld_kernarg_align(4)_width(all)_u32 $s0, [%use_vector]; - cmp_ne_b1_s32 $c0, $s0, 1; - cbr_b1 $c0, @LBB0_3; - // BB#2: // %if.then2 - shl_u64 $d0, $d0, 4; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_v4_global_align(16)_const_u32 ($s0, $s1, $s2, $s3), [$d0]; - st_v4_global_align(16)_u32 ($s0, $s1, $s2, $s3), [$d2]; - br @LBB0_4; - - @LBB0_3: - // %if.else - shl_u64 $d0, $d0, 2; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_global_align(4)_const_u32 $s0, [$d0]; - st_global_align(4)_u32 $s0, [$d2]; - - @LBB0_4: - // %if.end6 - ret; -}; -*/ - -static char kVectorCopyAlignedRawKv[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, -8, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, -76, 1, 0, 0, 0, 0, 0, 0, - -76, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 65, 0, -84, 0, -112, 0, 0, 0, - 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 11, 0, 7, 0, 7, 0, 0, 0, 9, 0, 0, - 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, -116, -65, - 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, 0, - 0, 0, 74, 4, 7, 64, -64, -128, 2, 2, 126, 127, 0, - -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 32, 0, -120, - -65, 6, 7, 1, -64, 0, 7, -126, -64, 127, 0, -116, -65, - 2, -127, 0, -65, 14, 0, -124, -65, 0, 0, -62, -46, 0, - 9, 1, 0, 4, 0, 4, 74, 5, 2, 6, 126, 3, 3, - 6, 80, 0, 0, 56, -36, 2, 0, 0, 2, 6, 0, 0, - 74, 7, 2, 12, 126, 6, 3, 2, 80, 112, 0, -116, -65, - 0, 0, 120, -36, 0, 2, 0, 0, 13, 0, -126, -65, 0, - 0, -62, -46, 0, 5, 1, 0, 4, 0, 4, 74, 5, 2, - 6, 126, 3, 3, 6, 80, 0, 0, 48, -36, 2, 0, 0, - 2, 6, 0, 0, 74, 7, 2, 6, 126, 3, 3, 2, 80, - 112, 0, -116, -65, 0, 0, 112, -36, 0, 2, 0, 0, 0, - 0, -127, -65, 0, 0, 0, 0, 4, 0, 0, 0, 8, 0, - 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, - 0, 0, 0, 0, 0, 4, 0, 0, 0, 12, 0, 0, 0, - 2, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 1, 1, 1, 0, 4, 0, 0, 0, 25, 0, - 0, 0, 5, 0, 0, 0, 65, 77, 68, 0, 22, 0, 45, - 104, 115, 97, 95, 99, 97, 108, 108, 95, 99, 111, 110, 118, - 101, 110, 116, 105, 111, 110, 61, 0, 0, 0, 0, 0, 4, - 0, 0, 0, 30, 0, 0, 0, 3, 0, 0, 0, 65, 77, - 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, 68, 71, - 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 8, - 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, 0, -32, 101, - -118, -12, -1, 127, 0, 0, 38, 95, 95, 99, 111, 112, 121, - 95, 98, 117, 102, 102, 101, 114, 95, 97, 108, 105, 103, 110, - 101, 100, 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, 104, - 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, 115, - 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -76, 1, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 3, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, - 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, - 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 46, - 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, -76, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -72, 2, - 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, - 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 3, 0, - 0, 0, 0, 0, 0, 52, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, - 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, -104, 3, 0, 0, - 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, - 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, - 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, - 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -56, 3, 0, 0, 0, - 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; -extern char* const kVectorCopyAlignedKvObject = &kVectorCopyAlignedRawKv[0]; -extern size_t const kVectorCopyAlignedKvObjectSize = - sizeof(kVectorCopyAlignedRawKv); - -#define HSA_FILL_MEMORY_KV_AKC_SIZE 352 -#define HSA_FILL_MEMORY_KV_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kFillMemoryRawKv. -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; - -prog kernel &__fill_memory_kernel( -kernarg_u64 %ptr, -kernarg_u64 %num, -kernarg_u32 %value) -{ -@__fill_memory_kernel_entry: -// BB#0: // %entry -workitemabsid_u32 $s0, 0; -cvt_u64_u32 $d0, $s0; -ld_kernarg_align(8)_width(all)_u64 $d1, [%num]; -cmp_ge_b1_u64 $c0, $d0, $d1; -cbr_b1 $c0, @LBB0_2; -// BB#1: // %if.end -ld_kernarg_align(8)_width(all)_u64 $d1, [%ptr]; -ld_kernarg_align(4)_width(all)_u32 $s0, [%value]; -shl_u64 $d0, $d0, 2; -add_u64 $d0, $d1, $d0; -st_global_align(4)_u32 $s0, [$d0]; - -@LBB0_2: -// %return -ret; -}; -*/ - -static char kFillMemoryRawKv[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, -104, 3, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 56, 0, - 1, 0, 64, 0, 6, 0, 5, 0, 3, 0, 0, 96, 6, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 1, - 0, 0, 0, 0, 0, 0, 96, 1, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, -84, 0, - -112, 0, 0, 0, 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 11, 0, 3, 0, 3, 0, 0, 0, 9, 0, - 0, 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, - -116, -65, 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, - 0, 0, 0, 74, 2, 7, 64, -64, -128, 2, 2, 126, 127, 0, - -116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 11, 0, -120, -65, - 0, 7, 65, -64, 4, 7, 2, -64, 0, 0, -62, -46, 0, 5, - 1, 0, 127, 0, -116, -65, 2, 0, 0, 74, 3, 2, 4, 126, - 2, 3, 2, 80, 4, 2, 4, 126, 0, 0, 112, -36, 0, 2, - 0, 0, 0, 0, -127, -65, 4, 0, 0, 0, 8, 0, 0, 0, - 1, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 4, 0, 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, - 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, - 1, 0, 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, 0, 0, - 65, 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, 99, 97, 108, - 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, 0, - 0, 0, 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, 3, 0, - 0, 0, 65, 77, 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, - 68, 71, 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, - 8, 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, 0, 48, 123, - 44, -103, -4, 127, 0, 0, 38, 95, 95, 102, 105, 108, 108, 95, - 109, 101, 109, 111, 114, 121, 95, 107, 101, 114, 110, 101, 108, 0, - 95, 95, 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, - 104, 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 96, 1, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 3, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, 0, - 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, 98, 0, - 46, 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, 115, 116, 114, - 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, - -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 96, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, - 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 96, 2, 0, 0, 0, 0, - 0, 0, -88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0, 0, 0, - 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 8, 3, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 24, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56, 3, 0, 0, - 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, 0, - 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, - 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, 0, - 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 104, 3, 0, 0, 0, 0, 0, 0, 42, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, -}; - -extern char* const kFillMemoryKvObject = &kFillMemoryRawKv[0]; -extern size_t const kFillMemoryKvObjectSize = sizeof(kFillMemoryRawKv); -#endif // header guard \ No newline at end of file diff --git a/runtime/hsa-runtime/core/inc/amd_blit_kernel_vi.h b/runtime/hsa-runtime/core/inc/amd_blit_kernel_vi.h deleted file mode 100644 index 13969370ba..0000000000 --- a/runtime/hsa-runtime/core/inc/amd_blit_kernel_vi.h +++ /dev/null @@ -1,490 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// The University of Illinois/NCSA -// Open Source License (NCSA) -// -// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// -// Developed by: -// -// AMD Research and AMD HSA Software Development -// -// Advanced Micro Devices, Inc. -// -// www.amd.com -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to -// deal with the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in -// the documentation and/or other materials provided with the distribution. -// - Neither the names of Advanced Micro Devices, Inc, -// nor the names of its contributors may be used to endorse or promote -// products derived from this Software without specific prior written -// permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -// DEALINGS WITH THE SOFTWARE. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_ -#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_ - -#include - -#define HSA_VECTOR_COPY_VI_AKC_SIZE 380 -#define HSA_VECTOR_COPY_VI_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kVectorCopyRawVi. -module &m:1:0:$full:$large:$default; - -prog kernel &__vector_copy_kernel( - kernarg_u64 %src, - kernarg_u64 %dst, - kernarg_u64 %size) -{ - @__vector_copy_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @BB0_2; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; - ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_global_u8 $s0, [$d0]; - st_global_u8 $s0, [$d2]; - - @BB0_2: - // %return - ret; -}; -*/ - -static char kVectorCopyRawVi[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, -72, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 124, 1, 0, 0, 0, 0, 0, 0, - 124, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, -63, 2, -84, 0, -112, 0, 0, 0, - 11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 96, 0, 5, 0, 5, 0, 0, 0, 9, 0, 0, - 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, - 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, - 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 16, 0, - 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, - 125, 106, 32, -128, -66, 16, 0, -120, -65, 3, 1, 10, -64, - 0, 0, 0, 0, 127, 0, -116, -65, 4, 0, 2, 50, 5, - 2, 4, 126, 2, 106, 28, -47, 2, 1, -87, 1, 0, 0, - 64, -36, 1, 0, 0, 1, 6, 0, 6, 50, 7, 2, 4, - 126, 4, 106, 28, -47, 2, 1, -87, 1, 112, 0, -116, -65, - 0, 0, 96, -36, 3, 1, 0, 0, 0, 0, -127, -65, 0, - 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 1, 0, - 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 4, 0, 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, - 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, - 1, 1, 0, 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, - 0, 0, 65, 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, - 99, 97, 108, 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, - 111, 110, 61, 0, 0, 0, 0, 0, 4, 0, 0, 0, 30, - 0, 0, 0, 3, 0, 0, 0, 65, 77, 68, 0, 4, 0, - 7, 0, 8, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, - 0, 65, 77, 68, 0, 65, 77, 68, 71, 80, 85, 0, 0, - 0, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 4, - 0, 0, 0, 65, 77, 68, 0, 32, 103, -72, 81, -3, 127, - 0, 0, 38, 95, 95, 118, 101, 99, 116, 111, 114, 95, 99, - 111, 112, 121, 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, - 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, - 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 124, 1, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, - 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, - 101, 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, - 114, 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, - 46, 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 124, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128, - 2, 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 16, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 3, - 0, 0, 0, 0, 0, 0, 44, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 3, 0, - 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, - 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, - 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, -120, 3, 0, 0, - 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; -extern char* const kVectorCopyViObject = &kVectorCopyRawVi[0]; -extern size_t const kVectorCopyViObjectSize = sizeof(kVectorCopyRawVi); - -#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE 452 -#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawVi. -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; - -prog kernel &__copy_buffer_aligned_kernel( - kernarg_u64 %src, - kernarg_u64 %dst, - kernarg_u64 %size, - kernarg_u32 %use_vector) -{ - @__copy_buffer_aligned_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%size]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @LBB0_4; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d2, [%dst]; - ld_kernarg_align(8)_width(all)_u64 $d1, [%src]; - ld_kernarg_align(4)_width(all)_u32 $s0, [%use_vector]; - cmp_ne_b1_s32 $c0, $s0, 1; - cbr_b1 $c0, @LBB0_3; - // BB#2: // %if.then2 - shl_u64 $d0, $d0, 4; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_v4_global_align(16)_const_u32 ($s0, $s1, $s2, $s3), [$d0]; - st_v4_global_align(16)_u32 ($s0, $s1, $s2, $s3), [$d2]; - br @LBB0_4; - - @LBB0_3: - // %if.else - shl_u64 $d0, $d0, 2; - add_u64 $d2, $d2, $d0; - add_u64 $d0, $d1, $d0; - ld_global_align(4)_const_u32 $s0, [$d0]; - st_global_align(4)_u32 $s0, [$d2]; - - @LBB0_4: - // %if.end6 - ret; -}; -*/ - -static char kVectorCopyAlignedRawVi[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, -60, 1, 0, 0, 0, 0, 0, 0, - -60, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 65, 0, -84, 0, -112, 0, 0, 0, - 11, 0, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 16, 0, 8, 0, 8, 0, 0, 0, 12, 0, 0, - 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, - 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, - 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 16, 0, - 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, - 125, 106, 32, -128, -66, 34, 0, -120, -65, -125, 0, 2, -64, - 24, 0, 0, 0, 3, 2, 10, -64, 0, 0, 0, 0, 127, - 0, -116, -65, 2, -127, 0, -65, 14, 0, -124, -65, 0, 0, - -113, -46, -124, 0, 2, 0, 8, 0, 4, 50, 9, 2, 6, - 126, 3, 3, 6, 56, 0, 0, 92, -36, 2, 0, 0, 4, - 10, 0, 0, 50, 11, 2, 4, 126, 2, 3, 2, 56, 112, - 0, -116, -65, 0, 0, 124, -36, 0, 4, 0, 0, 13, 0, - -126, -65, 0, 0, -113, -46, -126, 0, 2, 0, 8, 0, 4, - 50, 9, 2, 6, 126, 3, 3, 6, 56, 0, 0, 80, -36, - 2, 0, 0, 4, 10, 0, 0, 50, 11, 2, 4, 126, 2, - 3, 2, 56, 112, 0, -116, -65, 0, 0, 112, -36, 0, 4, - 0, 0, 0, 0, -127, -65, 0, 0, 0, 0, 4, 0, 0, - 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 12, - 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 4, 0, 0, - 0, 25, 0, 0, 0, 5, 0, 0, 0, 65, 77, 68, 0, - 22, 0, 45, 104, 115, 97, 95, 99, 97, 108, 108, 95, 99, - 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, 0, 0, 0, - 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, 3, 0, 0, - 0, 65, 77, 68, 0, 4, 0, 7, 0, 8, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, 65, - 77, 68, 71, 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, - 0, 0, 8, 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, - 0, 96, 62, -27, 85, -1, 127, 0, 0, 38, 95, 95, 99, - 111, 112, 121, 95, 98, 117, 102, 102, 101, 114, 95, 97, 108, - 105, 103, 110, 101, 100, 95, 107, 101, 114, 110, 101, 108, 0, - 95, 95, 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, - 46, 104, 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, -60, 1, 0, 0, 0, 0, 0, 0, 30, 0, - 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, - 97, 116, 101, 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, - 115, 116, 114, 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, - 98, 0, 46, 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, - 0, -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -60, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, -56, 2, 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 16, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 112, 3, 0, 0, 0, 0, 0, 0, 52, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 24, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -88, - 3, 0, 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, - 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, - 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, - 32, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -40, 3, - 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; -extern char* const kVectorCopyAlignedViObject = &kVectorCopyAlignedRawVi[0]; -extern size_t const kVectorCopyAlignedViObjectSize = - sizeof(kVectorCopyAlignedRawVi); - -#define HSA_FILL_MEMORY_VI_AKC_SIZE 368 -#define HSA_FILL_MEMORY_VI_AKC_OFFSET 256 - -/*****HSAIL code of the ISA in ::kFillMemoryRawVi. -module &m:1:0:$full:$large:$default; -extension "amd:gcn"; - -prog kernel &__fill_memory_kernel( - kernarg_u64 %ptr, - kernarg_u64 %num, - kernarg_u32 %value) -{ - @__fill_memory_kernel_entry: - // BB#0: // %entry - workitemabsid_u32 $s0, 0; - cvt_u64_u32 $d0, $s0; - ld_kernarg_align(8)_width(all)_u64 $d1, [%num]; - cmp_ge_b1_u64 $c0, $d0, $d1; - cbr_b1 $c0, @LBB0_2; - // BB#1: // %if.end - ld_kernarg_align(8)_width(all)_u64 $d1, [%ptr]; - ld_kernarg_align(4)_width(all)_u32 $s0, [%value]; - shl_u64 $d0, $d0, 2; - add_u64 $d0, $d1, $d0; - st_global_align(4)_u32 $s0, [$d0]; - - @LBB0_2: - // %return - ret; -}; -*/ - -static char kFillMemoryRawVi[] = { - 127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, - 0, -88, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3, - 0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, - 112, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 64, 0, -84, 0, -112, 0, 0, 0, - 11, 0, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 13, 0, 3, 0, 3, 0, 0, 0, 9, 0, 0, - 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0, - 127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0, - 8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 8, 0, - 0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40, - 125, 106, 32, -128, -66, 13, 0, -120, -65, -125, 0, 6, -64, - 0, 0, 0, 0, 3, 1, 2, -64, 16, 0, 0, 0, 0, - 0, -113, -46, -126, 0, 2, 0, 127, 0, -116, -65, 2, 0, - 0, 50, 3, 2, 4, 126, 2, 3, 2, 56, 4, 2, 4, - 126, 0, 0, 112, -36, 0, 2, 0, 0, 0, 0, -127, -65, - 4, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 0, 65, - 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, - 0, 0, 12, 0, 0, 0, 2, 0, 0, 0, 65, 77, 68, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, - 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, 0, 0, 65, - 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, 99, 97, 108, - 108, 95, 99, 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, - 0, 0, 0, 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, - 3, 0, 0, 0, 65, 77, 68, 0, 4, 0, 7, 0, 8, - 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 65, 77, - 68, 0, 65, 77, 68, 71, 80, 85, 0, 0, 0, 0, 0, - 0, 4, 0, 0, 0, 8, 0, 0, 0, 4, 0, 0, 0, - 65, 77, 68, 0, 16, -20, 88, 97, -4, 127, 0, 0, 38, - 95, 95, 102, 105, 108, 108, 95, 109, 101, 109, 111, 114, 121, - 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, 104, 115, 97, - 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, 115, 97, 116, - 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112, 1, - 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 3, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, - 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, - 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, - 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, - 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 112, 2, 0, 0, - 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, - 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 24, 3, 0, 0, 0, - 0, 0, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, - 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 72, 3, 0, 0, 0, 0, - 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, - 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, - 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, - 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 120, 3, 0, 0, 0, 0, 0, - 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, -}; - -extern char* const kFillMemoryViObject = &kFillMemoryRawVi[0]; -extern size_t const kFillMemoryViObjectSize = sizeof(kFillMemoryRawVi); -#endif // header guard \ No newline at end of file diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 35f683bc36..56aac90f96 100644 --- a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -73,8 +73,10 @@ class BlitSdma : public core::Blit { /// /// @note: The call will block until all packets have executed. /// + /// @param agent Agent passed to Initialize. + /// /// @return hsa_status_t - virtual hsa_status_t Destroy() override; + virtual hsa_status_t Destroy(const core::Agent& agent) override; /// @brief Submit a linear copy command to the queue buffer. /// diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index abd854679f..fb8eb0ecec 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -165,20 +165,24 @@ class GpuAgent : public GpuAgentInt { uint16_t GetMicrocodeVersion() const; - // @brief Assembles SP3 shader source into executable code. + // @brief Assembles SP3 shader source into ISA or AQL code object. // // @param [in] src_sp3 SP3 shader source text representation. // @param [in] func_name Name of the SP3 function to assemble. - // @param [out] code_buf Executable code buffer. - // @param [out] code_buf_size Size of executable code buffer in bytes. - void AssembleShader(const char* src_sp3, const char* func_name, - void*& code_buf, size_t& code_buf_size); + // @param [in] assemble_target ISA or AQL assembly target. + // @param [out] code_buf Code object buffer. + // @param [out] code_buf_size Size of code object buffer in bytes. + enum class AssembleTarget { ISA, AQL }; - // @brief Frees executable code created by AssembleShader. + void AssembleShader(const char* src_sp3, const char* func_name, + AssembleTarget assemble_target, void*& code_buf, + size_t& code_buf_size) const; + + // @brief Frees code object created by AssembleShader. // - // @param [in] code_buf Executable code buffer. - // @param [in] code_buf_size Size of executable code buffer in bytes. - void ReleaseShader(void* code_buf, size_t code_buf_size); + // @param [in] code_buf Code object buffer. + // @param [in] code_buf_size Size of code object buffer in bytes. + void ReleaseShader(void* code_buf, size_t code_buf_size) const; // @brief Override from core::Agent. hsa_status_t VisitRegion(bool include_peer, diff --git a/runtime/hsa-runtime/core/inc/blit.h b/runtime/hsa-runtime/core/inc/blit.h index f44a6bab1e..cf1e6da653 100644 --- a/runtime/hsa-runtime/core/inc/blit.h +++ b/runtime/hsa-runtime/core/inc/blit.h @@ -66,8 +66,10 @@ class Blit { /// /// @note: The call will block until all commands have executed. /// + /// @param agent Agent passed to Initialize. + /// /// @return hsa_status_t - virtual hsa_status_t Destroy() = 0; + virtual hsa_status_t Destroy(const core::Agent& agent) = 0; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is blocking until the command execution is diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp index a05aef5369..18bf0bcecb 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp @@ -43,40 +43,486 @@ #include "core/inc/amd_blit_kernel.h" #include -#include -#include -#include +#include +#include -#if defined(_WIN32) || defined(_WIN64) -#define NOMINMAX -#include -#else -#include -#endif - -#include "core/inc/amd_blit_kernel_kv.h" -#include "core/inc/amd_blit_kernel_vi.h" #include "core/inc/amd_gpu_agent.h" #include "core/inc/hsa_internal.h" #include "core/util/utils.h" namespace amd { -const uint32_t BlitKernel::kGroupSize = 256; -const size_t BlitKernel::kMaxCopyCount = AlignDown(UINT32_MAX, kGroupSize); -const size_t BlitKernel::kMaxFillCount = AlignDown(UINT32_MAX, kGroupSize); - static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID; +static std::string kBlitKernelSource(R"( + // Compatibility function for GFXIP 7. + + function s_load_dword_offset(byte_offset) + if kGFXIPVersion == 7 + return byte_offset / 4 + else + return byte_offset + end + end + + // Memory copy for all cases except: + // (src_addr & 0x3) != (dst_addr & 0x3) + // + // Kernel argument buffer: + // [DW 0, 1] Phase 1 src start address + // [DW 2, 3] Phase 1 dst start address + // [DW 4, 5] Phase 2 src start address + // [DW 6, 7] Phase 2 dst start address + // [DW 8, 9] Phase 3 src start address + // [DW 10,11] Phase 3 dst start address + // [DW 12,13] Phase 4 src start address + // [DW 14,15] Phase 4 dst start address + // [DW 16,17] Phase 4 src end address + // [DW 18,19] Phase 4 dst end address + // [DW 20 ] Total number of workitems + + var kCopyAlignedVecWidth = 4 + var kCopyAlignedUnroll = 1 + + shader CopyAligned + type(CS) + user_sgpr_count(2) + sgpr_count(32) + vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth)) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20) + s_load_dwordx4 s[16:19], s[0:1], s_load_dword_offset(0x30) + s_load_dwordx4 s[20:23], s[0:1], s_load_dword_offset(0x40) + s_load_dword s24, s[0:1], s_load_dword_offset(0x50) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_u32 v0, vcc, s2, v0 + + // ===================================================== + // Phase 1: Byte copy up to 0x100 destination alignment. + // ===================================================== + + // Compute phase source address. + v_mov_b32 v3, s5 + v_add_u32 v2, vcc, v0, s4 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s7 + v_add_u32 v4, vcc, v0, s6 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_ALIGNED_PHASE_1_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_ubyte v1, v[2:3] + s_waitcnt vmcnt(0) + v_add_u32 v2, vcc, v2, s24 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Write to/advance the destination address. + flat_store_byte v[4:5], v1 + v_add_u32 v4, vcc, v4, s24 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_1_LOOP + + L_COPY_ALIGNED_PHASE_1_DONE: + // Restore EXEC mask for all lanes. + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + // ======================================================== + // Phase 2: Unrolled dword[x4] copy up to last whole block. + // ======================================================== + + // Compute unrolled dword[x4] stride across all threads. + if kCopyAlignedVecWidth == 4 + s_lshl_b32 s25, s24, 0x4 + else + s_lshl_b32 s25, s24, 0x2 + end + + // Compute phase source address. + if kCopyAlignedVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + else + v_lshlrev_b32 v1, 0x2, v0 + end + + v_mov_b32 v3, s9 + v_add_u32 v2, vcc, v1, s8 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s11 + v_add_u32 v4, vcc, v1, s10 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_ALIGNED_PHASE_2_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[12:13] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE + + // Load from/advance the source address. + for var i = 0; i < kCopyAlignedUnroll; i ++ + if kCopyAlignedVecWidth == 4 + flat_load_dwordx4 v[8 + (i * 4)], v[2:3] + else + flat_load_dword v[8 + i], v[2:3] + end + + v_add_u32 v2, vcc, v2, s25 + v_addc_u32 v3, vcc, v3, 0x0, vcc + end + + // Write to/advance the destination address. + s_waitcnt vmcnt(0) + + for var i = 0; i < kCopyAlignedUnroll; i ++ + if kCopyAlignedVecWidth == 4 + flat_store_dwordx4 v[4:5], v[8 + (i * 4)] + else + flat_store_dword v[4:5], v[8 + i] + end + + v_add_u32 v4, vcc, v4, s25 + v_addc_u32 v5, vcc, v5, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_2_LOOP + + L_COPY_ALIGNED_PHASE_2_DONE: + + // =========================================== + // Phase 3: Dword copy up to last whole dword. + // =========================================== + + // Compute dword stride across all threads. + s_lshl_b32 s25, s24, 0x2 + + // Compute phase source address. + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s13 + v_add_u32 v2, vcc, v1, s12 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s15 + v_add_u32 v4, vcc, v1, s14 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_ALIGNED_PHASE_3_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[16:17] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_dword v1, v[2:3] + v_add_u32 v2, vcc, v2, s25 + v_addc_u32 v3, vcc, v3, 0x0, vcc + s_waitcnt vmcnt(0) + + // Write to/advance the destination address. + flat_store_dword v[4:5], v1 + v_add_u32 v4, vcc, v4, s25 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_3_LOOP + + L_COPY_ALIGNED_PHASE_3_DONE: + // Restore EXEC mask for all lanes. + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + // ============================= + // Phase 4: Byte copy up to end. + // ============================= + + // Compute phase source address. + v_mov_b32 v3, s17 + v_add_u32 v2, vcc, v0, s16 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s19 + v_add_u32 v4, vcc, v0, s18 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[20:21] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE + s_and_b64 exec, exec, vcc + + // Load from the source address. + flat_load_ubyte v1, v[2:3] + s_waitcnt vmcnt(0) + + // Write to the destination address. + flat_store_byte v[4:5], v1 + + L_COPY_ALIGNED_PHASE_4_DONE: + s_endpgm + end + + // Memory copy for this case: + // (src_addr & 0x3) != (dst_addr & 0x3) + // + // Kernel argument buffer: + // [DW 0, 1] Phase 1 src start address + // [DW 2, 3] Phase 1 dst start address + // [DW 4, 5] Phase 2 src start address + // [DW 6, 7] Phase 2 dst start address + // [DW 8, 9] Phase 2 src end address + // [DW 10,11] Phase 2 dst end address + // [DW 12 ] Total number of workitems + + var kCopyMisalignedUnroll = 4 + + shader CopyMisaligned + type(CS) + user_sgpr_count(2) + sgpr_count(23) + vgpr_count(6 + kCopyMisalignedUnroll) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20) + s_load_dword s16, s[0:1], s_load_dword_offset(0x30) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_u32 v0, vcc, s2, v0 + + // =================================================== + // Phase 1: Unrolled byte copy up to last whole block. + // =================================================== + + // Compute phase source address. + v_mov_b32 v3, s5 + v_add_u32 v2, vcc, v0, s4 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s7 + v_add_u32 v4, vcc, v0, s6 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_MISALIGNED_PHASE_1_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE + + // Load from/advance the source address. + for var i = 0; i < kCopyMisalignedUnroll; i ++ + flat_load_ubyte v[6 + i], v[2:3] + v_add_u32 v2, vcc, v2, s16 + v_addc_u32 v3, vcc, v3, 0x0, vcc + end + + // Write to/advance the destination address. + s_waitcnt vmcnt(0) + + for var i = 0; i < kCopyMisalignedUnroll; i ++ + flat_store_byte v[4:5], v[6 + i] + v_add_u32 v4, vcc, v4, s16 + v_addc_u32 v5, vcc, v5, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_COPY_MISALIGNED_PHASE_1_LOOP + + L_COPY_MISALIGNED_PHASE_1_DONE: + + // ============================= + // Phase 2: Byte copy up to end. + // ============================= + + // Compute phase source address. + v_mov_b32 v3, s9 + v_add_u32 v2, vcc, v0, s8 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s11 + v_add_u32 v4, vcc, v0, s10 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + L_COPY_MISALIGNED_PHASE_2_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[12:13] + s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_ubyte v1, v[2:3] + v_add_u32 v2, vcc, v2, s16 + v_addc_u32 v3, vcc, v3, 0x0, vcc + s_waitcnt vmcnt(0) + + // Write to/advance the destination address. + flat_store_byte v[4:5], v1 + v_add_u32 v4, vcc, v4, s16 + v_addc_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_MISALIGNED_PHASE_2_LOOP + + L_COPY_MISALIGNED_PHASE_2_DONE: + s_endpgm + end + + // Memory fill for dword-aligned region. + // + // Kernel argument buffer: + // [DW 0, 1] Phase 1 dst start address + // [DW 2, 3] Phase 2 dst start address + // [DW 4, 5] Phase 2 dst end address + // [DW 6 ] Value to fill memory with + // [DW 7 ] Total number of workitems + + var kFillVecWidth = 4 + var kFillUnroll = 1 + + shader Fill + type(CS) + user_sgpr_count(2) + sgpr_count(19) + vgpr_count(8) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_u32 v0, vcc, s2, v0 + + // Copy fill pattern into VGPRs. + for var i = 0; i < kFillVecWidth; i ++ + v_mov_b32 v[4 + i], s10 + end + + // ======================================================== + // Phase 1: Unrolled dword[x4] fill up to last whole block. + // ======================================================== + + // Compute unrolled dword[x4] stride across all threads. + if kFillVecWidth == 4 + s_lshl_b32 s12, s11, 0x4 + else + s_lshl_b32 s12, s11, 0x2 + end + + // Compute phase destination address. + if kFillVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + else + v_lshlrev_b32 v1, 0x2, v0 + end + + v_mov_b32 v3, s5 + v_add_u32 v2, vcc, v1, s4 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + L_FILL_PHASE_1_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[6:7] + s_cbranch_vccz L_FILL_PHASE_1_DONE + + // Write to/advance the destination address. + for var i = 0; i < kFillUnroll; i ++ + if kFillVecWidth == 4 + flat_store_dwordx4 v[2:3], v[4:7] + else + flat_store_dword v[2:3], v4 + end + + v_add_u32 v2, vcc, v2, s12 + v_addc_u32 v3, vcc, v3, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_FILL_PHASE_1_LOOP + + L_FILL_PHASE_1_DONE: + + // ============================== + // Phase 2: Dword fill up to end. + // ============================== + + // Compute dword stride across all threads. + s_lshl_b32 s12, s11, 0x2 + + // Compute phase destination address. + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s7 + v_add_u32 v2, vcc, v1, s6 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + L_FILL_PHASE_2_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_FILL_PHASE_2_DONE + s_and_b64 exec, exec, vcc + + // Write to/advance the destination address. + flat_store_dword v[2:3], v4 + v_add_u32 v2, vcc, v2, s12 + v_addc_u32 v3, vcc, v3, 0x0, vcc + + // Repeat until branched out. + s_branch L_FILL_PHASE_2_LOOP + + L_FILL_PHASE_2_DONE: + s_endpgm + end +)"); + +// Search kernel source for variable definition and return value. +int GetKernelSourceParam(const char* paramName) { + std::stringstream paramDef; + paramDef << "var " << paramName << " = "; + + std::string::size_type paramDefLoc = kBlitKernelSource.find(paramDef.str()); + assert(paramDefLoc != std::string::npos); + std::string::size_type paramValLoc = paramDefLoc + paramDef.str().size(); + std::string::size_type paramEndLoc = + kBlitKernelSource.find('\n', paramDefLoc); + assert(paramDefLoc != std::string::npos); + + std::string paramVal(&kBlitKernelSource[paramValLoc], + &kBlitKernelSource[paramEndLoc]); + return std::stoi(paramVal); +} + +static int kCopyAlignedVecWidth = GetKernelSourceParam("kCopyAlignedVecWidth"); +static int kCopyAlignedUnroll = GetKernelSourceParam("kCopyAlignedUnroll"); +static int kCopyMisalignedUnroll = GetKernelSourceParam("kCopyMisalignedUnroll"); +static int kFillVecWidth = GetKernelSourceParam("kFillVecWidth"); +static int kFillUnroll = GetKernelSourceParam("kFillUnroll"); + BlitKernel::BlitKernel() : core::Blit(), - copy_code_handle_(0), - fill_code_handle_(0), queue_(NULL), cached_index_(0), kernarg_async_(NULL), kernarg_async_mask_(0), kernarg_async_counter_(0), - code_arg_buffer_(NULL) { + num_cus_(0) { completion_signal_.handle = 0; } @@ -96,26 +542,8 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { return HSA_STATUS_ERROR; } - // Need queue buffer that can cover the max size of local memory. - const uint64_t kGpuVmVaSize = 1ULL << 40; - const uint32_t kRequiredQueueSize = NextPow2(static_cast( - std::ceil(static_cast(kGpuVmVaSize) / kMaxCopyCount))); - - uint32_t max_queue_size = 0; - status = HSA::hsa_agent_get_info(agent_handle, HSA_AGENT_INFO_QUEUE_MAX_SIZE, - &max_queue_size); - - if (HSA_STATUS_SUCCESS != status) { - return status; - } - - if (max_queue_size < kRequiredQueueSize) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - status = - HSA::hsa_queue_create(agent_handle, kRequiredQueueSize, - HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue_); + status = HSA::hsa_queue_create(agent_handle, 1024, HSA_QUEUE_TYPE_MULTI, NULL, + NULL, 0, 0, &queue_); if (HSA_STATUS_SUCCESS != status) { return status; @@ -125,86 +553,6 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { cached_index_ = 0; - void* copy_raw_obj_mem = NULL; - size_t copy_akc_size = 0; - size_t copy_akc_offset = 0; - - void* copy_aligned_raw_obj_mem = NULL; - size_t copy_aligned_akc_size = 0; - size_t copy_aligned_akc_offset = 0; - - void* fill_raw_obj_mem = NULL; - size_t fill_akc_size = 0; - size_t fill_akc_offset = 0; - - switch (agent.isa()->GetMajorVersion()) { - case 7: - copy_raw_obj_mem = kVectorCopyKvObject; - copy_akc_size = HSA_VECTOR_COPY_KV_AKC_SIZE; - copy_akc_offset = HSA_VECTOR_COPY_KV_AKC_OFFSET; - - copy_aligned_raw_obj_mem = kVectorCopyAlignedKvObject; - copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE; - copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET; - - fill_raw_obj_mem = kFillMemoryKvObject; - fill_akc_size = HSA_FILL_MEMORY_KV_AKC_SIZE; - fill_akc_offset = HSA_FILL_MEMORY_KV_AKC_OFFSET; - break; - case 8: - copy_raw_obj_mem = kVectorCopyViObject; - copy_akc_size = HSA_VECTOR_COPY_VI_AKC_SIZE; - copy_akc_offset = HSA_VECTOR_COPY_VI_AKC_OFFSET; - - copy_aligned_raw_obj_mem = kVectorCopyAlignedViObject; - copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE; - copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET; - - fill_raw_obj_mem = kFillMemoryViObject; - fill_akc_size = HSA_FILL_MEMORY_VI_AKC_SIZE; - fill_akc_offset = HSA_FILL_MEMORY_VI_AKC_OFFSET; - break; - default: - assert(false && "Only gfx7 and gfx8 are supported"); - break; - } - - const size_t total_alloc_size = AlignUp( - AlignUp(copy_akc_size, 256) + AlignUp(copy_aligned_akc_size, 256) + - AlignUp(fill_akc_size, 256), - 4096); - - amd_kernel_code_t *code_ptr = nullptr; - code_arg_buffer_ = core::Runtime::runtime_singleton_->system_allocator()( - total_alloc_size, 4096); - - char* akc_arg = reinterpret_cast(code_arg_buffer_); - memcpy(akc_arg, - reinterpret_cast(copy_raw_obj_mem) + copy_akc_offset, - copy_akc_size); - copy_code_handle_ = reinterpret_cast(akc_arg); - code_ptr = (amd_kernel_code_t*)(copy_code_handle_); - code_ptr->runtime_loader_kernel_symbol = 0; - akc_arg += copy_akc_size; - - akc_arg = AlignUp(akc_arg, 256); - memcpy(akc_arg, reinterpret_cast(copy_aligned_raw_obj_mem) + - copy_aligned_akc_offset, - copy_aligned_akc_size); - copy_aligned_code_handle_ = reinterpret_cast(akc_arg); - code_ptr = (amd_kernel_code_t*)(copy_aligned_code_handle_); - code_ptr->runtime_loader_kernel_symbol = 0; - akc_arg += copy_aligned_akc_size; - - akc_arg = AlignUp(akc_arg, 256); - memcpy(akc_arg, - reinterpret_cast(fill_raw_obj_mem) + fill_akc_offset, - fill_akc_size); - fill_code_handle_ = reinterpret_cast(akc_arg); - code_ptr = (amd_kernel_code_t*)(fill_code_handle_); - code_ptr->runtime_loader_kernel_symbol = 0; - akc_arg += fill_akc_size; - status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_); if (HSA_STATUS_SUCCESS != status) { return status; @@ -212,34 +560,40 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { kernarg_async_ = reinterpret_cast( core::Runtime::runtime_singleton_->system_allocator()( - kRequiredQueueSize * AlignUp(sizeof(KernelArgs), 16), 16)); + queue_->size * AlignUp(sizeof(KernelArgs), 16), 16)); - kernarg_async_mask_ = kRequiredQueueSize - 1; + kernarg_async_mask_ = queue_->size - 1; - // TODO: remove this code when execute permission level is not mandatory. - if (((amd::GpuAgent&)agent).profile() == HSA_PROFILE_FULL) { -#if defined(_WIN32) || defined(_WIN64) -#define NOMINMAX - DWORD old_protect = 0; - const DWORD new_protect = PAGE_EXECUTE_READWRITE; - if (!VirtualProtect(code_arg_buffer_, total_alloc_size, new_protect, - &old_protect)) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } -#else - if (0 != mprotect(code_arg_buffer_, total_alloc_size, - PROT_READ | PROT_WRITE | PROT_EXEC)) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } -#endif + // Obtain the number of compute units in the underlying agent. + const GpuAgent& gpuAgent = static_cast(agent); + num_cus_ = gpuAgent.properties().NumFComputeCores / 4; + + // Assemble shaders to AQL code objects. + std::map kernel_names = { + {KernelType::CopyAligned, "CopyAligned"}, + {KernelType::CopyMisaligned, "CopyMisaligned"}, + {KernelType::Fill, "Fill"}}; + + for (auto kernel_name : kernel_names) { + KernelCode& kernel = kernels_[kernel_name.first]; + gpuAgent.AssembleShader(kBlitKernelSource.c_str(), kernel_name.second, + GpuAgent::AssembleTarget::AQL, kernel.code_buf_, + kernel.code_buf_size_); } return HSA_STATUS_SUCCESS; } -hsa_status_t BlitKernel::Destroy(void) { +hsa_status_t BlitKernel::Destroy(const core::Agent& agent) { std::lock_guard guard(lock_); + const GpuAgent& gpuAgent = static_cast(agent); + + for (auto kernel_pair : kernels_) { + gpuAgent.ReleaseShader(kernel_pair.second.code_buf_, + kernel_pair.second.code_buf_size_); + } + if (queue_ != NULL) { HSA::hsa_queue_destroy(queue_); } @@ -248,10 +602,6 @@ hsa_status_t BlitKernel::Destroy(void) { core::Runtime::runtime_singleton_->system_deallocator()(kernarg_async_); } - if (code_arg_buffer_ != NULL) { - core::Runtime::runtime_singleton_->system_deallocator()(code_arg_buffer_); - } - if (completion_signal_.handle != 0) { HSA::hsa_signal_destroy(completion_signal_); } @@ -259,11 +609,6 @@ hsa_status_t BlitKernel::Destroy(void) { return HSA_STATUS_SUCCESS; } -static bool IsSystemMemory(void* address) { - static const uint64_t kLimitSystem = 1ULL << 48; - return (reinterpret_cast(address) < kLimitSystem); -} - hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, size_t size) { // Protect completion_signal_. @@ -294,35 +639,14 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, hsa_status_t BlitKernel::SubmitLinearCopyCommand( void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) { - assert(copy_code_handle_ != 0); - - const size_t kAlignmentChar = 1; - const size_t kAlignmentUin32 = 4; - const size_t kAlignmentVec4 = 16; - const size_t copy_granule = - (IsMultipleOf(dst, kAlignmentVec4) && IsMultipleOf(src, kAlignmentVec4) && - IsMultipleOf(size, kAlignmentVec4)) - ? kAlignmentVec4 - : (IsMultipleOf(dst, kAlignmentUin32) && - IsMultipleOf(src, kAlignmentUin32) && - IsMultipleOf(size, kAlignmentUin32)) - ? kAlignmentUin32 - : kAlignmentChar; - - size = size / copy_granule; - - const uint32_t num_copy_packet = static_cast( - std::ceil(static_cast(size) / kMaxCopyCount)); - - const uint32_t num_barrier_packet = - static_cast(std::ceil(dep_signals.size() / 5.0f)); - - // Reserve write index for copy + fence packet. - const uint32_t total_num_packet = num_barrier_packet + num_copy_packet; + // Reserve write index for barrier(s) + dispatch packet. + const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5); + const uint32_t total_num_packet = num_barrier_packet + 1; uint64_t write_index = AcquireWriteIndex(total_num_packet); uint64_t write_index_temp = write_index; + // Insert barrier packets to handle dependent signals. const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | @@ -352,99 +676,116 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand( } } - const uint32_t last_copy_index = num_copy_packet - 1; - size_t total_copy_count = 0; - for (uint32_t i = 0; i < num_copy_packet; ++i) { - // Setup arguments. - const uint32_t copy_count = static_cast( - std::min((size - total_copy_count), kMaxCopyCount)); + // Insert dispatch packet for copy kernel. + KernelArgs* args = ObtainAsyncKernelCopyArg(); + KernelCode* kernel_code = nullptr; + int num_workitems = 0; - void* cur_dst = static_cast(dst) + (total_copy_count * copy_granule); - const void* cur_src = - static_cast(src) + (total_copy_count * copy_granule); + bool aligned = ((uintptr_t(src) & 0x3) == (uintptr_t(dst) & 0x3)); - KernelArgs* args = ObtainAsyncKernelCopyArg(); - assert(args != NULL); - assert(IsMultipleOf(&args->copy, 16)); + if (aligned) { + // Use dword-based aligned kernel. + kernel_code = &kernels_[KernelType::CopyAligned]; - args->copy.src = cur_src; - args->copy.dst = cur_dst; - args->copy.size = copy_count; - args->copy.use_vector = (copy_granule == kAlignmentVec4) ? 1 : 0; + // Compute the size of each copy phase. + num_workitems = 64 * 4 * num_cus_; - const uint32_t grid_size_x = - AlignUp(static_cast(copy_count), kGroupSize); + // Phase 1 (byte copy) ends when destination is 0x100-aligned. + uintptr_t src_start = uintptr_t(src); + uintptr_t dst_start = uintptr_t(dst); + uint64_t phase1_size = + std::min(size, uint64_t(0x100 - (dst_start & 0xFF)) & 0xFF); - // This assert to make sure kMaxCopySize is not changed to a number that - // could cause overflow to packet.grid_size_x. - assert(grid_size_x >= copy_count); + // Phase 2 (unrolled dwordx4 copy) ends when last whole block fits. + uint64_t phase2_block = num_workitems * sizeof(uint32_t) * + kCopyAlignedUnroll * kCopyAlignedVecWidth; + uint64_t phase2_size = ((size - phase1_size) / phase2_block) * phase2_block; - hsa_signal_t signal = {(i == last_copy_index) - ? (core::Signal::Convert(&out_signal)).handle - : 0}; - PopulateQueue(write_index, ((copy_granule == kAlignmentChar) - ? copy_code_handle_ - : copy_aligned_code_handle_), - args, grid_size_x, signal); + // Phase 3 (dword copy) ends when last whole dword fits. + uint64_t phase3_size = + ((size - phase1_size - phase2_size) / sizeof(uint32_t)) * + sizeof(uint32_t); - ++write_index; + args->copy_aligned.phase1_src_start = src_start; + args->copy_aligned.phase1_dst_start = dst_start; + args->copy_aligned.phase2_src_start = src_start + phase1_size; + args->copy_aligned.phase2_dst_start = dst_start + phase1_size; + args->copy_aligned.phase3_src_start = src_start + phase1_size + phase2_size; + args->copy_aligned.phase3_dst_start = dst_start + phase1_size + phase2_size; + args->copy_aligned.phase4_src_start = + src_start + phase1_size + phase2_size + phase3_size; + args->copy_aligned.phase4_dst_start = + dst_start + phase1_size + phase2_size + phase3_size; + args->copy_aligned.phase4_src_end = src_start + size; + args->copy_aligned.phase4_dst_end = dst_start + size; + args->copy_aligned.num_workitems = num_workitems; + } else { + // Use byte-based misaligned kernel. + kernel_code = &kernels_[KernelType::CopyMisaligned]; - total_copy_count += copy_count; + // Compute the size of each copy phase. + num_workitems = 64 * 4 * num_cus_; + + // Phase 1 (unrolled byte copy) ends when last whole block fits. + uintptr_t src_start = uintptr_t(src); + uintptr_t dst_start = uintptr_t(dst); + uint64_t phase1_block = + num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll; + uint64_t phase1_size = (size / phase1_block) * phase1_block; + + args->copy_misaligned.phase1_src_start = src_start; + args->copy_misaligned.phase1_dst_start = dst_start; + args->copy_misaligned.phase2_src_start = src_start + phase1_size; + args->copy_misaligned.phase2_dst_start = dst_start + phase1_size; + args->copy_misaligned.phase2_src_end = src_start + size; + args->copy_misaligned.phase2_dst_end = dst_start + size; + args->copy_misaligned.num_workitems = num_workitems; } - // Launch copy packet. + hsa_signal_t signal = {(core::Signal::Convert(&out_signal)).handle}; + PopulateQueue(write_index, uintptr_t(kernel_code->code_buf_), args, + num_workitems, signal); + + // Submit barrier(s) and dispatch packets. ReleaseWriteIndex(write_index_temp, total_num_packet); return HSA_STATUS_SUCCESS; } hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value, - size_t num) { - assert(fill_code_handle_ != 0); - + size_t count) { std::lock_guard guard(lock_); - HSA::hsa_signal_store_relaxed(completion_signal_, 1); - - const uint32_t num_fill_packet = static_cast( - std::ceil(static_cast(num) / kMaxFillCount)); - - // Reserve write index for copy + fence packet. - uint64_t write_index = AcquireWriteIndex(num_fill_packet); - - const uint32_t last_fill_index = num_fill_packet - 1; - size_t total_fill_count = 0; - for (uint32_t i = 0; i < num_fill_packet; ++i) { - // Setup arguments. - const uint32_t fill_count = static_cast( - std::min((num - total_fill_count), kMaxFillCount)); - void* cur_ptr = static_cast(ptr) + total_fill_count; - - KernelArgs* args = ObtainAsyncKernelCopyArg(); - assert(args != NULL); - assert(IsMultipleOf(&args->fill, 16)); - - args->fill.ptr = cur_ptr; - args->fill.num = fill_count; - args->fill.value = value; - - const uint32_t grid_size_x = - AlignUp(static_cast(fill_count), kGroupSize); - - // This assert to make sure kMaxFillCount is not changed to a number that - // could cause overflow to packet.grid_size_x. - assert(grid_size_x >= fill_count); - - hsa_signal_t signal = {(i == last_fill_index) ? completion_signal_.handle - : 0}; - PopulateQueue(write_index + i, fill_code_handle_, &args[i], grid_size_x, - signal); - - total_fill_count += fill_count; + // Reject misaligned base address. + if ((uintptr_t(ptr) & 0x3) != 0) { + return HSA_STATUS_ERROR; } - // Launch fill packet. - ReleaseWriteIndex(write_index, num_fill_packet); + // Compute the size of each fill phase. + int num_workitems = 64 * num_cus_; + + // Phase 1 (unrolled dwordx4 copy) ends when last whole block fits. + uintptr_t dst_start = uintptr_t(ptr); + uint64_t fill_size = count * sizeof(uint32_t); + + uint64_t phase1_block = + num_workitems * sizeof(uint32_t) * kFillUnroll * kFillVecWidth; + uint64_t phase1_size = (fill_size / phase1_block) * phase1_block; + + KernelArgs* args = ObtainAsyncKernelCopyArg(); + args->fill.phase1_dst_start = dst_start; + args->fill.phase2_dst_start = dst_start + phase1_size; + args->fill.phase2_dst_end = dst_start + fill_size; + args->fill.fill_value = value; + args->fill.num_workitems = num_workitems; + + // Submit dispatch packet. + HSA::hsa_signal_store_relaxed(completion_signal_, 1); + + uint64_t write_index = AcquireWriteIndex(1); + PopulateQueue(write_index, uintptr_t(kernels_[KernelType::Fill].code_buf_), + args, num_workitems, completion_signal_); + ReleaseWriteIndex(write_index, 1); // Wait for the packet to finish. if (HSA::hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_LT, @@ -556,9 +897,9 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args, // Setup working size. const int kNumDimension = 1; packet.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet.grid_size_x = AlignUp(static_cast(grid_size_x), kGroupSize); + packet.grid_size_x = AlignUp(static_cast(grid_size_x), 64); packet.grid_size_y = packet.grid_size_z = 1; - packet.workgroup_size_x = kGroupSize; + packet.workgroup_size_x = 64; packet.workgroup_size_y = packet.workgroup_size_z = 1; packet.completion_signal = completion_signal; diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index b89ba76273..ff9378e745 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -404,7 +404,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { if (err != HSAKMT_STATUS_SUCCESS) { assert(false && "AQL queue memory map failure."); - Destroy(); + Destroy(agent); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } @@ -419,7 +419,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { hsaKmtCreateQueue(gpu_agent.node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_, queue_size_, NULL, &queue_resource_)) { - Destroy(); + Destroy(agent); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } @@ -436,14 +436,14 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { fence_pool_size_ * sizeof(uint32_t), 256)); if (fence_base_addr_ == NULL) { - Destroy(); + Destroy(agent); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } return HSA_STATUS_SUCCESS; } -hsa_status_t BlitSdma::Destroy(void) { +hsa_status_t BlitSdma::Destroy(const core::Agent& agent) { // Release all allocated resources and reset them to zero. if (queue_resource_.QueueId != 0) { diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 244f7eaf53..49361b1f40 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include "core/inc/amd_aql_queue.h" @@ -126,7 +127,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) GpuAgent::~GpuAgent() { if (blit_h2d_ != NULL) { - hsa_status_t status = blit_h2d_->Destroy(); + hsa_status_t status = blit_h2d_->Destroy(*this); assert(status == HSA_STATUS_SUCCESS); delete blit_h2d_; @@ -134,7 +135,7 @@ GpuAgent::~GpuAgent() { } if (blit_d2h_ != NULL) { - hsa_status_t status = blit_d2h_->Destroy(); + hsa_status_t status = blit_d2h_->Destroy(*this); assert(status == HSA_STATUS_SUCCESS); delete blit_d2h_; @@ -158,33 +159,51 @@ GpuAgent::~GpuAgent() { } void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name, - void*& code_buf, size_t& code_buf_size) { + AssembleTarget assemble_target, void*& code_buf, + size_t& code_buf_size) const { #ifdef __linux__ // No VS builds of libsp3 available right now + std::string src_sp3_unified(src_sp3); + + if (isa_->GetMajorVersion() == 7) { + // On Gfx7 replace v_add_u32 with legacy equivalent v_add_i32. + std::string add_inst_gfx8("v_add_u32"), add_inst_gfx7("v_add_i32"); + + for (size_t instIdx = 0; (instIdx = src_sp3_unified.find( + add_inst_gfx8, instIdx)) != std::string::npos; + instIdx += add_inst_gfx8.size()) { + src_sp3_unified.replace(instIdx, add_inst_gfx7.size(), add_inst_gfx7); + } + } + // Assemble source string with libsp3. sp3_context* sp3 = sp3_new(); switch (isa_->GetMajorVersion()) { case 7: sp3_setasic(sp3, "CI"); + sp3_set_param_int(sp3, "kGFXIPVersion", 7); break; case 8: sp3_setasic(sp3, "VI"); + sp3_set_param_int(sp3, "kGFXIPVersion", 8); break; default: assert(false && "SP3 assembly not supported on this agent"); } - sp3_parse_string(sp3, src_sp3); + sp3_parse_string(sp3, src_sp3_unified.c_str()); sp3_shader* code_sp3_meta = sp3_compile(sp3, func_name); - // Allocate a GPU-visible buffer for the trap shader. + // Allocate a GPU-visible buffer for the shader. HsaMemFlags code_buf_flags = {0}; code_buf_flags.ui32.HostAccess = 1; code_buf_flags.ui32.ExecuteAccess = 1; code_buf_flags.ui32.NoSubstitute = 1; size_t code_size = code_sp3_meta->size * sizeof(uint32_t); - code_buf_size = AlignUp(code_size, 0x1000); + size_t header_size = + (assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0); + code_buf_size = AlignUp(header_size + code_size, 0x1000); HSAKMT_STATUS err = hsaKmtAllocMemory(node_id(), code_buf_size, code_buf_flags, &code_buf); @@ -193,9 +212,38 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name, err = hsaKmtMapMemoryToGPU(code_buf, code_buf_size, NULL); assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(Trap) failed"); - // Copy trap handler code into the GPU-visible buffer. memset(code_buf, 0, code_buf_size); - memcpy(code_buf, code_sp3_meta->data, code_size); + + // Populate optional code object header. + if (assemble_target == AssembleTarget::AQL) { + amd_kernel_code_t* header = reinterpret_cast(code_buf); + + int gran_sgprs = std::max(0, (int(code_sp3_meta->nsgprs) - 1) / 8); + int gran_vgprs = std::max(0, (int(code_sp3_meta->nvgprs) - 1) / 4); + + header->kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t); + AMD_HSA_BITS_SET(header->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, + 1); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, + gran_sgprs); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, + gran_vgprs); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 3); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 1); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, + AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2); + AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, + AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1); + } + + // Copy trap handler code into the GPU-visible buffer. + memcpy((void*)(uintptr_t(code_buf) + header_size), code_sp3_meta->data, + code_size); // Release SP3 resources. sp3_free_shader(code_sp3_meta); @@ -203,7 +251,7 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name, #endif } -void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) { +void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const { hsaKmtUnmapMemoryToGPU(code_buf); hsaKmtFreeMemory(code_buf, code_buf_size); } @@ -377,7 +425,7 @@ core::Blit* GpuAgent::CreateBlitSdma() { BlitSdma* sdma = new BlitSdma(); if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) { - sdma->Destroy(); + sdma->Destroy(*this); delete sdma; sdma = NULL; } @@ -389,7 +437,7 @@ core::Blit* GpuAgent::CreateBlitKernel() { BlitKernel* kernl = new BlitKernel(); if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) { - kernl->Destroy(); + kernl->Destroy(*this); delete kernl; kernl = NULL; } @@ -904,7 +952,8 @@ void GpuAgent::BindTrapHandler() { } // Assemble the trap handler source code. - AssembleShader(src_sp3, "TrapHandler", trap_code_buf_, trap_code_buf_size_); + AssembleShader(src_sp3, "TrapHandler", AssembleTarget::ISA, trap_code_buf_, + trap_code_buf_size_); // Bind the trap handler to this node. HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_,