Implement optimized blit/fill kernels
Replace HSAIL kernels with SP3 shaders. Support all alignment variations efficiently. Change-Id: Icf7f5471f3ba68389f55484d82f2805dd9bc3827
Этот коммит содержится в:
@@ -43,6 +43,7 @@
|
||||
#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_
|
||||
#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_
|
||||
|
||||
#include <map>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "core/inc/blit.h"
|
||||
@@ -66,8 +67,10 @@ class BlitKernel : public core::Blit {
|
||||
///
|
||||
/// @note: The call will block until all AQL packets have been executed.
|
||||
///
|
||||
/// @param agent Agent passed to Initialize.
|
||||
///
|
||||
/// @return hsa_status_t
|
||||
virtual hsa_status_t Destroy() override;
|
||||
virtual hsa_status_t Destroy(const core::Agent& agent) override;
|
||||
|
||||
/// @brief Submit an AQL packet to perform vector copy. The call is blocking
|
||||
/// until the command execution is finished.
|
||||
@@ -104,17 +107,36 @@ class BlitKernel : public core::Blit {
|
||||
|
||||
private:
|
||||
union KernelArgs {
|
||||
struct __ALIGNED__(16) KernelCopyArgs {
|
||||
const void* src;
|
||||
void* dst;
|
||||
uint64_t size;
|
||||
uint32_t use_vector;
|
||||
} copy;
|
||||
struct __ALIGNED__(16) {
|
||||
uint64_t phase1_src_start;
|
||||
uint64_t phase1_dst_start;
|
||||
uint64_t phase2_src_start;
|
||||
uint64_t phase2_dst_start;
|
||||
uint64_t phase3_src_start;
|
||||
uint64_t phase3_dst_start;
|
||||
uint64_t phase4_src_start;
|
||||
uint64_t phase4_dst_start;
|
||||
uint64_t phase4_src_end;
|
||||
uint64_t phase4_dst_end;
|
||||
uint32_t num_workitems;
|
||||
} copy_aligned;
|
||||
|
||||
struct __ALIGNED__(16) KernelFillArgs {
|
||||
void* ptr;
|
||||
uint64_t num;
|
||||
uint32_t value;
|
||||
struct __ALIGNED__(16) {
|
||||
uint64_t phase1_src_start;
|
||||
uint64_t phase1_dst_start;
|
||||
uint64_t phase2_src_start;
|
||||
uint64_t phase2_dst_start;
|
||||
uint64_t phase2_src_end;
|
||||
uint64_t phase2_dst_end;
|
||||
uint32_t num_workitems;
|
||||
} copy_misaligned;
|
||||
|
||||
struct __ALIGNED__(16) {
|
||||
uint64_t phase1_dst_start;
|
||||
uint64_t phase2_dst_start;
|
||||
uint64_t phase2_dst_end;
|
||||
uint32_t fill_value;
|
||||
uint32_t num_workitems;
|
||||
} fill;
|
||||
};
|
||||
|
||||
@@ -136,14 +158,19 @@ class BlitKernel : public core::Blit {
|
||||
|
||||
KernelArgs* ObtainAsyncKernelCopyArg();
|
||||
|
||||
/// Handles to the vector copy kernel.
|
||||
uint64_t copy_code_handle_;
|
||||
/// AQL code object and size for each kernel.
|
||||
enum class KernelType {
|
||||
CopyAligned,
|
||||
CopyMisaligned,
|
||||
Fill,
|
||||
};
|
||||
|
||||
/// Handles to the vector copy aligned kernel.
|
||||
uint64_t copy_aligned_code_handle_;
|
||||
struct KernelCode {
|
||||
void* code_buf_;
|
||||
size_t code_buf_size_;
|
||||
};
|
||||
|
||||
/// Handles to the fill memory kernel.
|
||||
uint64_t fill_code_handle_;
|
||||
std::map<KernelType, KernelCode> kernels_;
|
||||
|
||||
/// AQL queue for submitting the vector copy kernel.
|
||||
hsa_queue_t* queue_;
|
||||
@@ -163,12 +190,8 @@ class BlitKernel : public core::Blit {
|
||||
/// Lock to synchronize access to kernarg_ and completion_signal_
|
||||
std::mutex lock_;
|
||||
|
||||
/// Pointer to memory containing the ISA and argument buffer.
|
||||
void* code_arg_buffer_;
|
||||
|
||||
static const size_t kMaxCopyCount;
|
||||
static const size_t kMaxFillCount;
|
||||
static const uint32_t kGroupSize;
|
||||
/// Number of CUs on the underlying agent.
|
||||
int num_cus_;
|
||||
};
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -1,479 +0,0 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_
|
||||
#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#define HSA_VECTOR_COPY_KV_AKC_SIZE 368
|
||||
#define HSA_VECTOR_COPY_KV_AKC_OFFSET 256
|
||||
|
||||
/*****HSAIL code of the ISA in ::kVectorCopyRawKv.
|
||||
module &m:1:0:$full:$large:$default;
|
||||
|
||||
prog kernel &__vector_copy_kernel(
|
||||
kernarg_u64 %src,
|
||||
kernarg_u64 %dst,
|
||||
kernarg_u64 %size)
|
||||
{
|
||||
@__vector_copy_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_u64_u32 $d0, $s0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%size];
|
||||
cmp_ge_b1_u64 $c0, $d0, $d1;
|
||||
cbr_b1 $c0, @BB0_2;
|
||||
// BB#1: // %if.end
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%src];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%dst];
|
||||
add_u64 $d2, $d2, $d0;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
ld_global_u8 $s0, [$d0];
|
||||
st_global_u8 $s0, [$d2];
|
||||
|
||||
@BB0_2:
|
||||
// %return
|
||||
ret;
|
||||
};
|
||||
*/
|
||||
|
||||
static char kVectorCopyRawKv[] = {
|
||||
127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0,
|
||||
0, -104, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3,
|
||||
0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0,
|
||||
112, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 65, 0, -116, 0, -112, 0, 0, 0,
|
||||
11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 11, 0, 5, 0, 5, 0, 0, 0, 9, 0, 0,
|
||||
0, 0, 0, 0, 0, 3, 0, 0, 6, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, -116, -65,
|
||||
0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, 0,
|
||||
0, 0, 74, 4, 7, 64, -64, -128, 2, 2, 126, 127, 0,
|
||||
-116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 15, 0, -120,
|
||||
-65, 0, 7, -126, -64, 127, 0, -116, -65, 4, 0, 2, 74,
|
||||
5, 2, 4, 126, 2, 106, 80, -46, 2, 1, -87, 1, 0,
|
||||
0, 32, -36, 1, 0, 0, 1, 6, 0, 6, 74, 7, 2,
|
||||
4, 126, 4, 106, 80, -46, 2, 1, -87, 1, 112, 0, -116,
|
||||
-65, 0, 0, 96, -36, 3, 1, 0, 0, 0, 0, -127, -65,
|
||||
3, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 0, 65,
|
||||
77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0,
|
||||
0, 0, 12, 0, 0, 0, 2, 0, 0, 0, 65, 77, 68,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
|
||||
3, 0, 0, 0, 28, 0, 0, 0, 3, 0, 0, 0, 65,
|
||||
77, 68, 0, 4, 0, 7, 0, 7, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, 68,
|
||||
71, 80, 85, 0, 0, 3, 0, 0, 0, 40, 0, 0, 0,
|
||||
4, 0, 0, 0, 65, 77, 68, 0, 26, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 65, 77, 68, 32, 72, 83,
|
||||
65, 32, 82, 117, 110, 116, 105, 109, 101, 32, 70, 105, 110,
|
||||
97, 108, 105, 122, 101, 114, 0, 0, 0, 38, 95, 95, 118,
|
||||
101, 99, 116, 111, 114, 95, 99, 111, 112, 121, 95, 107, 101,
|
||||
114, 110, 101, 108, 0, 95, 95, 104, 115, 97, 95, 115, 101,
|
||||
99, 116, 105, 111, 110, 46, 104, 115, 97, 116, 101, 120, 116,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0, 1, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 22, 0, 0, 0, 3, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 46, 104, 115, 97, 116, 101, 120, 116, 0, 46, 110,
|
||||
111, 116, 101, 0, 46, 115, 116, 114, 116, 97, 98, 0, 46,
|
||||
115, 121, 109, 116, 97, 98, 0, 46, 115, 104, 115, 116, 114,
|
||||
116, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
1, 0, 0, 0, 7, 0, -64, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
|
||||
0, 0, 112, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 7,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 112, 2, 0, 0, 0, 0, 0,
|
||||
0, -104, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0,
|
||||
0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 8, 3, 0, 0, 0, 0, 0, 0,
|
||||
44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 2, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 56, 3, 0, 0, 0, 0, 0, 0, 48,
|
||||
0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
|
||||
0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
|
||||
0, 0, 0, 0, 0, 32, 0, 0, 0, 3, 0, 0, 0,
|
||||
32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 104, 3, 0, 0, 0, 0, 0, 0, 42, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
};
|
||||
extern char* const kVectorCopyKvObject = &kVectorCopyRawKv[0];
|
||||
extern size_t const kVectorCopyKvObjectSize = sizeof(kVectorCopyRawKv);
|
||||
|
||||
#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE 436
|
||||
#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET 256
|
||||
|
||||
/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawKv.
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
|
||||
prog kernel &__copy_buffer_aligned_kernel(
|
||||
kernarg_u64 %src,
|
||||
kernarg_u64 %dst,
|
||||
kernarg_u64 %size,
|
||||
kernarg_u32 %use_vector)
|
||||
{
|
||||
@__copy_buffer_aligned_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_u64_u32 $d0, $s0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%size];
|
||||
cmp_ge_b1_u64 $c0, $d0, $d1;
|
||||
cbr_b1 $c0, @LBB0_4;
|
||||
// BB#1: // %if.end
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%dst];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%src];
|
||||
ld_kernarg_align(4)_width(all)_u32 $s0, [%use_vector];
|
||||
cmp_ne_b1_s32 $c0, $s0, 1;
|
||||
cbr_b1 $c0, @LBB0_3;
|
||||
// BB#2: // %if.then2
|
||||
shl_u64 $d0, $d0, 4;
|
||||
add_u64 $d2, $d2, $d0;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
ld_v4_global_align(16)_const_u32 ($s0, $s1, $s2, $s3), [$d0];
|
||||
st_v4_global_align(16)_u32 ($s0, $s1, $s2, $s3), [$d2];
|
||||
br @LBB0_4;
|
||||
|
||||
@LBB0_3:
|
||||
// %if.else
|
||||
shl_u64 $d0, $d0, 2;
|
||||
add_u64 $d2, $d2, $d0;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
ld_global_align(4)_const_u32 $s0, [$d0];
|
||||
st_global_align(4)_u32 $s0, [$d2];
|
||||
|
||||
@LBB0_4:
|
||||
// %if.end6
|
||||
ret;
|
||||
};
|
||||
*/
|
||||
|
||||
static char kVectorCopyAlignedRawKv[] = {
|
||||
127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0,
|
||||
0, -8, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3,
|
||||
0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, -76, 1, 0, 0, 0, 0, 0, 0,
|
||||
-76, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 65, 0, -84, 0, -112, 0, 0, 0,
|
||||
11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 11, 0, 7, 0, 7, 0, 0, 0, 9, 0, 0,
|
||||
0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0, -116, -65,
|
||||
0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109, 0,
|
||||
0, 0, 74, 4, 7, 64, -64, -128, 2, 2, 126, 127, 0,
|
||||
-116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 32, 0, -120,
|
||||
-65, 6, 7, 1, -64, 0, 7, -126, -64, 127, 0, -116, -65,
|
||||
2, -127, 0, -65, 14, 0, -124, -65, 0, 0, -62, -46, 0,
|
||||
9, 1, 0, 4, 0, 4, 74, 5, 2, 6, 126, 3, 3,
|
||||
6, 80, 0, 0, 56, -36, 2, 0, 0, 2, 6, 0, 0,
|
||||
74, 7, 2, 12, 126, 6, 3, 2, 80, 112, 0, -116, -65,
|
||||
0, 0, 120, -36, 0, 2, 0, 0, 13, 0, -126, -65, 0,
|
||||
0, -62, -46, 0, 5, 1, 0, 4, 0, 4, 74, 5, 2,
|
||||
6, 126, 3, 3, 6, 80, 0, 0, 48, -36, 2, 0, 0,
|
||||
2, 6, 0, 0, 74, 7, 2, 6, 126, 3, 3, 2, 80,
|
||||
112, 0, -116, -65, 0, 0, 112, -36, 0, 2, 0, 0, 0,
|
||||
0, -127, -65, 0, 0, 0, 0, 4, 0, 0, 0, 8, 0,
|
||||
0, 0, 1, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0,
|
||||
0, 0, 0, 0, 0, 4, 0, 0, 0, 12, 0, 0, 0,
|
||||
2, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 1, 1, 1, 0, 4, 0, 0, 0, 25, 0,
|
||||
0, 0, 5, 0, 0, 0, 65, 77, 68, 0, 22, 0, 45,
|
||||
104, 115, 97, 95, 99, 97, 108, 108, 95, 99, 111, 110, 118,
|
||||
101, 110, 116, 105, 111, 110, 61, 0, 0, 0, 0, 0, 4,
|
||||
0, 0, 0, 30, 0, 0, 0, 3, 0, 0, 0, 65, 77,
|
||||
68, 0, 4, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77, 68, 71,
|
||||
80, 85, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 8,
|
||||
0, 0, 0, 4, 0, 0, 0, 65, 77, 68, 0, -32, 101,
|
||||
-118, -12, -1, 127, 0, 0, 38, 95, 95, 99, 111, 112, 121,
|
||||
95, 98, 117, 102, 102, 101, 114, 95, 97, 108, 105, 103, 110,
|
||||
101, 100, 95, 107, 101, 114, 110, 101, 108, 0, 95, 95, 104,
|
||||
115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, 104, 115,
|
||||
97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
-76, 1, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 3,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101,
|
||||
120, 116, 0, 46, 110, 111, 116, 101, 0, 46, 115, 116, 114,
|
||||
116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 46,
|
||||
115, 104, 115, 116, 114, 116, 97, 98, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, -64, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, -76, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -72, 2,
|
||||
0, 0, 0, 0, 0, 0, -88, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16,
|
||||
0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 3, 0,
|
||||
0, 0, 0, 0, 0, 52, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0,
|
||||
0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, -104, 3, 0, 0,
|
||||
0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3,
|
||||
0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0,
|
||||
0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0,
|
||||
0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, -56, 3, 0, 0, 0,
|
||||
0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
extern char* const kVectorCopyAlignedKvObject = &kVectorCopyAlignedRawKv[0];
|
||||
extern size_t const kVectorCopyAlignedKvObjectSize =
|
||||
sizeof(kVectorCopyAlignedRawKv);
|
||||
|
||||
#define HSA_FILL_MEMORY_KV_AKC_SIZE 352
|
||||
#define HSA_FILL_MEMORY_KV_AKC_OFFSET 256
|
||||
|
||||
/*****HSAIL code of the ISA in ::kFillMemoryRawKv.
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
|
||||
prog kernel &__fill_memory_kernel(
|
||||
kernarg_u64 %ptr,
|
||||
kernarg_u64 %num,
|
||||
kernarg_u32 %value)
|
||||
{
|
||||
@__fill_memory_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_u64_u32 $d0, $s0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%num];
|
||||
cmp_ge_b1_u64 $c0, $d0, $d1;
|
||||
cbr_b1 $c0, @LBB0_2;
|
||||
// BB#1: // %if.end
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%ptr];
|
||||
ld_kernarg_align(4)_width(all)_u32 $s0, [%value];
|
||||
shl_u64 $d0, $d0, 2;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
st_global_align(4)_u32 $s0, [$d0];
|
||||
|
||||
@LBB0_2:
|
||||
// %return
|
||||
ret;
|
||||
};
|
||||
*/
|
||||
|
||||
static char kFillMemoryRawKv[] = {
|
||||
127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, -104, 3,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 56, 0,
|
||||
1, 0, 64, 0, 6, 0, 5, 0, 3, 0, 0, 96, 6, 0,
|
||||
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 1,
|
||||
0, 0, 0, 0, 0, 0, 96, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, -84, 0,
|
||||
-112, 0, 0, 0, 11, 0, 10, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 11, 0, 3, 0, 3, 0, 0, 0, 9, 0,
|
||||
0, 0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, -64, 127, 0,
|
||||
-116, -65, 0, -1, -128, -109, 0, 0, 16, 0, 0, 8, 0, -109,
|
||||
0, 0, 0, 74, 2, 7, 64, -64, -128, 2, 2, 126, 127, 0,
|
||||
-116, -65, 0, 0, -56, 125, 106, 36, -128, -66, 11, 0, -120, -65,
|
||||
0, 7, 65, -64, 4, 7, 2, -64, 0, 0, -62, -46, 0, 5,
|
||||
1, 0, 127, 0, -116, -65, 2, 0, 0, 74, 3, 2, 4, 126,
|
||||
2, 3, 2, 80, 4, 2, 4, 126, 0, 0, 112, -36, 0, 2,
|
||||
0, 0, 0, 0, -127, -65, 4, 0, 0, 0, 8, 0, 0, 0,
|
||||
1, 0, 0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 4, 0, 0, 0, 12, 0, 0, 0, 2, 0, 0, 0,
|
||||
65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
|
||||
1, 0, 4, 0, 0, 0, 25, 0, 0, 0, 5, 0, 0, 0,
|
||||
65, 77, 68, 0, 22, 0, 45, 104, 115, 97, 95, 99, 97, 108,
|
||||
108, 95, 99, 111, 110, 118, 101, 110, 116, 105, 111, 110, 61, 0,
|
||||
0, 0, 0, 0, 4, 0, 0, 0, 30, 0, 0, 0, 3, 0,
|
||||
0, 0, 65, 77, 68, 0, 4, 0, 7, 0, 7, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 65, 77, 68, 0, 65, 77,
|
||||
68, 71, 80, 85, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
|
||||
8, 0, 0, 0, 4, 0, 0, 0, 65, 77, 68, 0, 48, 123,
|
||||
44, -103, -4, 127, 0, 0, 38, 95, 95, 102, 105, 108, 108, 95,
|
||||
109, 101, 109, 111, 114, 121, 95, 107, 101, 114, 110, 101, 108, 0,
|
||||
95, 95, 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46,
|
||||
104, 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
96, 1, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 3, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, 120, 116, 0,
|
||||
46, 110, 111, 116, 101, 0, 46, 115, 116, 114, 116, 97, 98, 0,
|
||||
46, 115, 121, 109, 116, 97, 98, 0, 46, 115, 104, 115, 116, 114,
|
||||
116, 97, 98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0,
|
||||
-64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 96, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0,
|
||||
0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 96, 2, 0, 0, 0, 0,
|
||||
0, 0, -88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0, 0, 0,
|
||||
32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 8, 3, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
24, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56, 3, 0, 0,
|
||||
0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, 0,
|
||||
0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0,
|
||||
24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 3, 0,
|
||||
0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 104, 3, 0, 0, 0, 0, 0, 0, 42, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0,
|
||||
};
|
||||
|
||||
extern char* const kFillMemoryKvObject = &kFillMemoryRawKv[0];
|
||||
extern size_t const kFillMemoryKvObjectSize = sizeof(kFillMemoryRawKv);
|
||||
#endif // header guard
|
||||
@@ -1,490 +0,0 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_
|
||||
#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#define HSA_VECTOR_COPY_VI_AKC_SIZE 380
|
||||
#define HSA_VECTOR_COPY_VI_AKC_OFFSET 256
|
||||
|
||||
/*****HSAIL code of the ISA in ::kVectorCopyRawVi.
|
||||
module &m:1:0:$full:$large:$default;
|
||||
|
||||
prog kernel &__vector_copy_kernel(
|
||||
kernarg_u64 %src,
|
||||
kernarg_u64 %dst,
|
||||
kernarg_u64 %size)
|
||||
{
|
||||
@__vector_copy_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_u64_u32 $d0, $s0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%size];
|
||||
cmp_ge_b1_u64 $c0, $d0, $d1;
|
||||
cbr_b1 $c0, @BB0_2;
|
||||
// BB#1: // %if.end
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%src];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%dst];
|
||||
add_u64 $d2, $d2, $d0;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
ld_global_u8 $s0, [$d0];
|
||||
st_global_u8 $s0, [$d2];
|
||||
|
||||
@BB0_2:
|
||||
// %return
|
||||
ret;
|
||||
};
|
||||
*/
|
||||
|
||||
static char kVectorCopyRawVi[] = {
|
||||
127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0,
|
||||
0, -72, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3,
|
||||
0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 124, 1, 0, 0, 0, 0, 0, 0,
|
||||
124, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, -63, 2, -84, 0, -112, 0, 0, 0,
|
||||
11, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 96, 0, 5, 0, 5, 0, 0, 0, 9, 0, 0,
|
||||
0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0,
|
||||
127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0,
|
||||
8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 16, 0,
|
||||
0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40,
|
||||
125, 106, 32, -128, -66, 16, 0, -120, -65, 3, 1, 10, -64,
|
||||
0, 0, 0, 0, 127, 0, -116, -65, 4, 0, 2, 50, 5,
|
||||
2, 4, 126, 2, 106, 28, -47, 2, 1, -87, 1, 0, 0,
|
||||
64, -36, 1, 0, 0, 1, 6, 0, 6, 50, 7, 2, 4,
|
||||
126, 4, 106, 28, -47, 2, 1, -87, 1, 112, 0, -116, -65,
|
||||
0, 0, 96, -36, 3, 1, 0, 0, 0, 0, -127, -65, 0,
|
||||
0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 1, 0,
|
||||
0, 0, 65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 4, 0, 0, 0, 12, 0, 0, 0, 2, 0, 0, 0,
|
||||
65, 77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
|
||||
1, 1, 0, 4, 0, 0, 0, 25, 0, 0, 0, 5, 0,
|
||||
0, 0, 65, 77, 68, 0, 22, 0, 45, 104, 115, 97, 95,
|
||||
99, 97, 108, 108, 95, 99, 111, 110, 118, 101, 110, 116, 105,
|
||||
111, 110, 61, 0, 0, 0, 0, 0, 4, 0, 0, 0, 30,
|
||||
0, 0, 0, 3, 0, 0, 0, 65, 77, 68, 0, 4, 0,
|
||||
7, 0, 8, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
|
||||
0, 65, 77, 68, 0, 65, 77, 68, 71, 80, 85, 0, 0,
|
||||
0, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 4,
|
||||
0, 0, 0, 65, 77, 68, 0, 32, 103, -72, 81, -3, 127,
|
||||
0, 0, 38, 95, 95, 118, 101, 99, 116, 111, 114, 95, 99,
|
||||
111, 112, 121, 95, 107, 101, 114, 110, 101, 108, 0, 95, 95,
|
||||
104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110, 46, 104,
|
||||
115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 124, 1, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0,
|
||||
3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 46, 104, 115, 97, 116,
|
||||
101, 120, 116, 0, 46, 110, 111, 116, 101, 0, 46, 115, 116,
|
||||
114, 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0,
|
||||
46, 115, 104, 115, 116, 114, 116, 97, 98, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7, 0, -64,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 124, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 10, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128,
|
||||
2, 0, 0, 0, 0, 0, 0, -88, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
16, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 3,
|
||||
0, 0, 0, 0, 0, 0, 44, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24,
|
||||
0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 3, 0,
|
||||
0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0, 0, 0,
|
||||
3, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0,
|
||||
0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 32, 0,
|
||||
0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, -120, 3, 0, 0,
|
||||
0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
extern char* const kVectorCopyViObject = &kVectorCopyRawVi[0];
|
||||
extern size_t const kVectorCopyViObjectSize = sizeof(kVectorCopyRawVi);
|
||||
|
||||
#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE 452
|
||||
#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET 256
|
||||
|
||||
/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawVi.
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
|
||||
prog kernel &__copy_buffer_aligned_kernel(
|
||||
kernarg_u64 %src,
|
||||
kernarg_u64 %dst,
|
||||
kernarg_u64 %size,
|
||||
kernarg_u32 %use_vector)
|
||||
{
|
||||
@__copy_buffer_aligned_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_u64_u32 $d0, $s0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%size];
|
||||
cmp_ge_b1_u64 $c0, $d0, $d1;
|
||||
cbr_b1 $c0, @LBB0_4;
|
||||
// BB#1: // %if.end
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%dst];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%src];
|
||||
ld_kernarg_align(4)_width(all)_u32 $s0, [%use_vector];
|
||||
cmp_ne_b1_s32 $c0, $s0, 1;
|
||||
cbr_b1 $c0, @LBB0_3;
|
||||
// BB#2: // %if.then2
|
||||
shl_u64 $d0, $d0, 4;
|
||||
add_u64 $d2, $d2, $d0;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
ld_v4_global_align(16)_const_u32 ($s0, $s1, $s2, $s3), [$d0];
|
||||
st_v4_global_align(16)_u32 ($s0, $s1, $s2, $s3), [$d2];
|
||||
br @LBB0_4;
|
||||
|
||||
@LBB0_3:
|
||||
// %if.else
|
||||
shl_u64 $d0, $d0, 2;
|
||||
add_u64 $d2, $d2, $d0;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
ld_global_align(4)_const_u32 $s0, [$d0];
|
||||
st_global_align(4)_u32 $s0, [$d2];
|
||||
|
||||
@LBB0_4:
|
||||
// %if.end6
|
||||
ret;
|
||||
};
|
||||
*/
|
||||
|
||||
static char kVectorCopyAlignedRawVi[] = {
|
||||
127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0,
|
||||
0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3,
|
||||
0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, -60, 1, 0, 0, 0, 0, 0, 0,
|
||||
-60, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 65, 0, -84, 0, -112, 0, 0, 0,
|
||||
11, 0, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 16, 0, 8, 0, 8, 0, 0, 0, 12, 0, 0,
|
||||
0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0,
|
||||
127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0,
|
||||
8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 16, 0,
|
||||
0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40,
|
||||
125, 106, 32, -128, -66, 34, 0, -120, -65, -125, 0, 2, -64,
|
||||
24, 0, 0, 0, 3, 2, 10, -64, 0, 0, 0, 0, 127,
|
||||
0, -116, -65, 2, -127, 0, -65, 14, 0, -124, -65, 0, 0,
|
||||
-113, -46, -124, 0, 2, 0, 8, 0, 4, 50, 9, 2, 6,
|
||||
126, 3, 3, 6, 56, 0, 0, 92, -36, 2, 0, 0, 4,
|
||||
10, 0, 0, 50, 11, 2, 4, 126, 2, 3, 2, 56, 112,
|
||||
0, -116, -65, 0, 0, 124, -36, 0, 4, 0, 0, 13, 0,
|
||||
-126, -65, 0, 0, -113, -46, -126, 0, 2, 0, 8, 0, 4,
|
||||
50, 9, 2, 6, 126, 3, 3, 6, 56, 0, 0, 80, -36,
|
||||
2, 0, 0, 4, 10, 0, 0, 50, 11, 2, 4, 126, 2,
|
||||
3, 2, 56, 112, 0, -116, -65, 0, 0, 112, -36, 0, 4,
|
||||
0, 0, 0, 0, -127, -65, 0, 0, 0, 0, 4, 0, 0,
|
||||
0, 8, 0, 0, 0, 1, 0, 0, 0, 65, 77, 68, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 12,
|
||||
0, 0, 0, 2, 0, 0, 0, 65, 77, 68, 0, 1, 0,
|
||||
0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 4, 0, 0,
|
||||
0, 25, 0, 0, 0, 5, 0, 0, 0, 65, 77, 68, 0,
|
||||
22, 0, 45, 104, 115, 97, 95, 99, 97, 108, 108, 95, 99,
|
||||
111, 110, 118, 101, 110, 116, 105, 111, 110, 61, 0, 0, 0,
|
||||
0, 0, 4, 0, 0, 0, 30, 0, 0, 0, 3, 0, 0,
|
||||
0, 65, 77, 68, 0, 4, 0, 7, 0, 8, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 0, 0, 0, 65, 77, 68, 0, 65,
|
||||
77, 68, 71, 80, 85, 0, 0, 0, 0, 0, 0, 4, 0,
|
||||
0, 0, 8, 0, 0, 0, 4, 0, 0, 0, 65, 77, 68,
|
||||
0, 96, 62, -27, 85, -1, 127, 0, 0, 38, 95, 95, 99,
|
||||
111, 112, 121, 95, 98, 117, 102, 102, 101, 114, 95, 97, 108,
|
||||
105, 103, 110, 101, 100, 95, 107, 101, 114, 110, 101, 108, 0,
|
||||
95, 95, 104, 115, 97, 95, 115, 101, 99, 116, 105, 111, 110,
|
||||
46, 104, 115, 97, 116, 101, 120, 116, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 26, 0, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, -60, 1, 0, 0, 0, 0, 0, 0, 30, 0,
|
||||
0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46, 104, 115,
|
||||
97, 116, 101, 120, 116, 0, 46, 110, 111, 116, 101, 0, 46,
|
||||
115, 116, 114, 116, 97, 98, 0, 46, 115, 121, 109, 116, 97,
|
||||
98, 0, 46, 115, 104, 115, 116, 114, 116, 97, 98, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7,
|
||||
0, -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -60, 1, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 10, 0, 0, 0, 7, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, -56, 2, 0, 0, 0, 0, 0, 0, -88, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 16, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
112, 3, 0, 0, 0, 0, 0, 0, 52, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 24, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -88,
|
||||
3, 0, 0, 0, 0, 0, 0, 48, 0, 0, 0, 0, 0,
|
||||
0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0,
|
||||
0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0,
|
||||
32, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -40, 3,
|
||||
0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
extern char* const kVectorCopyAlignedViObject = &kVectorCopyAlignedRawVi[0];
|
||||
extern size_t const kVectorCopyAlignedViObjectSize =
|
||||
sizeof(kVectorCopyAlignedRawVi);
|
||||
|
||||
#define HSA_FILL_MEMORY_VI_AKC_SIZE 368
|
||||
#define HSA_FILL_MEMORY_VI_AKC_OFFSET 256
|
||||
|
||||
/*****HSAIL code of the ISA in ::kFillMemoryRawVi.
|
||||
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
|
||||
prog kernel &__fill_memory_kernel(
|
||||
kernarg_u64 %ptr,
|
||||
kernarg_u64 %num,
|
||||
kernarg_u32 %value)
|
||||
{
|
||||
@__fill_memory_kernel_entry:
|
||||
// BB#0: // %entry
|
||||
workitemabsid_u32 $s0, 0;
|
||||
cvt_u64_u32 $d0, $s0;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%num];
|
||||
cmp_ge_b1_u64 $c0, $d0, $d1;
|
||||
cbr_b1 $c0, @LBB0_2;
|
||||
// BB#1: // %if.end
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%ptr];
|
||||
ld_kernarg_align(4)_width(all)_u32 $s0, [%value];
|
||||
shl_u64 $d0, $d0, 2;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
st_global_align(4)_u32 $s0, [$d0];
|
||||
|
||||
@LBB0_2:
|
||||
// %return
|
||||
ret;
|
||||
};
|
||||
*/
|
||||
|
||||
static char kFillMemoryRawVi[] = {
|
||||
127, 69, 76, 70, 2, 1, 1, 64, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 1, 0, -32, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0,
|
||||
0, -88, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
64, 0, 56, 0, 1, 0, 64, 0, 6, 0, 5, 0, 3,
|
||||
0, 0, 96, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0,
|
||||
112, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 64, 0, -84, 0, -112, 0, 0, 0,
|
||||
11, 0, 74, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 13, 0, 3, 0, 3, 0, 0, 0, 9, 0, 0,
|
||||
0, 0, 0, 0, 0, 4, 4, 4, 6, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 2, 0, 2, -64, 4, 0, 0, 0,
|
||||
127, 0, -116, -65, 0, -1, -128, -110, 0, 0, 16, 0, 0,
|
||||
8, 0, -110, 0, 0, 0, 50, 3, 0, 6, -64, 8, 0,
|
||||
0, 0, -128, 2, 2, 126, 127, 0, -116, -65, 0, 0, -40,
|
||||
125, 106, 32, -128, -66, 13, 0, -120, -65, -125, 0, 6, -64,
|
||||
0, 0, 0, 0, 3, 1, 2, -64, 16, 0, 0, 0, 0,
|
||||
0, -113, -46, -126, 0, 2, 0, 127, 0, -116, -65, 2, 0,
|
||||
0, 50, 3, 2, 4, 126, 2, 3, 2, 56, 4, 2, 4,
|
||||
126, 0, 0, 112, -36, 0, 2, 0, 0, 0, 0, -127, -65,
|
||||
4, 0, 0, 0, 8, 0, 0, 0, 1, 0, 0, 0, 65,
|
||||
77, 68, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0,
|
||||
0, 0, 12, 0, 0, 0, 2, 0, 0, 0, 65, 77, 68,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
|
||||
4, 0, 0, 0, 25, 0, 0, 0, 5, 0, 0, 0, 65,
|
||||
77, 68, 0, 22, 0, 45, 104, 115, 97, 95, 99, 97, 108,
|
||||
108, 95, 99, 111, 110, 118, 101, 110, 116, 105, 111, 110, 61,
|
||||
0, 0, 0, 0, 0, 4, 0, 0, 0, 30, 0, 0, 0,
|
||||
3, 0, 0, 0, 65, 77, 68, 0, 4, 0, 7, 0, 8,
|
||||
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 65, 77,
|
||||
68, 0, 65, 77, 68, 71, 80, 85, 0, 0, 0, 0, 0,
|
||||
0, 4, 0, 0, 0, 8, 0, 0, 0, 4, 0, 0, 0,
|
||||
65, 77, 68, 0, 16, -20, 88, 97, -4, 127, 0, 0, 38,
|
||||
95, 95, 102, 105, 108, 108, 95, 109, 101, 109, 111, 114, 121,
|
||||
95, 107, 101, 114, 110, 101, 108, 0, 95, 95, 104, 115, 97,
|
||||
95, 115, 101, 99, 116, 105, 111, 110, 46, 104, 115, 97, 116,
|
||||
101, 120, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26,
|
||||
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112, 1,
|
||||
0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 3, 0, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 46, 104, 115, 97, 116, 101, 120, 116,
|
||||
0, 46, 110, 111, 116, 101, 0, 46, 115, 116, 114, 116, 97,
|
||||
98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 46, 115, 104,
|
||||
115, 116, 114, 116, 97, 98, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
|
||||
0, 0, 0, 1, 0, 0, 0, 7, 0, -64, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
|
||||
0, 0, 0, 0, 0, 112, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0,
|
||||
0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 112, 2, 0, 0,
|
||||
0, 0, 0, 0, -88, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0,
|
||||
0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 24, 3, 0, 0, 0,
|
||||
0, 0, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0,
|
||||
2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 72, 3, 0, 0, 0, 0,
|
||||
0, 0, 48, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,
|
||||
0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0,
|
||||
24, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 3,
|
||||
0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 120, 3, 0, 0, 0, 0, 0,
|
||||
0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
extern char* const kFillMemoryViObject = &kFillMemoryRawVi[0];
|
||||
extern size_t const kFillMemoryViObjectSize = sizeof(kFillMemoryRawVi);
|
||||
#endif // header guard
|
||||
@@ -73,8 +73,10 @@ class BlitSdma : public core::Blit {
|
||||
///
|
||||
/// @note: The call will block until all packets have executed.
|
||||
///
|
||||
/// @param agent Agent passed to Initialize.
|
||||
///
|
||||
/// @return hsa_status_t
|
||||
virtual hsa_status_t Destroy() override;
|
||||
virtual hsa_status_t Destroy(const core::Agent& agent) override;
|
||||
|
||||
/// @brief Submit a linear copy command to the queue buffer.
|
||||
///
|
||||
|
||||
@@ -165,20 +165,24 @@ class GpuAgent : public GpuAgentInt {
|
||||
|
||||
uint16_t GetMicrocodeVersion() const;
|
||||
|
||||
// @brief Assembles SP3 shader source into executable code.
|
||||
// @brief Assembles SP3 shader source into ISA or AQL code object.
|
||||
//
|
||||
// @param [in] src_sp3 SP3 shader source text representation.
|
||||
// @param [in] func_name Name of the SP3 function to assemble.
|
||||
// @param [out] code_buf Executable code buffer.
|
||||
// @param [out] code_buf_size Size of executable code buffer in bytes.
|
||||
void AssembleShader(const char* src_sp3, const char* func_name,
|
||||
void*& code_buf, size_t& code_buf_size);
|
||||
// @param [in] assemble_target ISA or AQL assembly target.
|
||||
// @param [out] code_buf Code object buffer.
|
||||
// @param [out] code_buf_size Size of code object buffer in bytes.
|
||||
enum class AssembleTarget { ISA, AQL };
|
||||
|
||||
// @brief Frees executable code created by AssembleShader.
|
||||
void AssembleShader(const char* src_sp3, const char* func_name,
|
||||
AssembleTarget assemble_target, void*& code_buf,
|
||||
size_t& code_buf_size) const;
|
||||
|
||||
// @brief Frees code object created by AssembleShader.
|
||||
//
|
||||
// @param [in] code_buf Executable code buffer.
|
||||
// @param [in] code_buf_size Size of executable code buffer in bytes.
|
||||
void ReleaseShader(void* code_buf, size_t code_buf_size);
|
||||
// @param [in] code_buf Code object buffer.
|
||||
// @param [in] code_buf_size Size of code object buffer in bytes.
|
||||
void ReleaseShader(void* code_buf, size_t code_buf_size) const;
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
hsa_status_t VisitRegion(bool include_peer,
|
||||
|
||||
@@ -66,8 +66,10 @@ class Blit {
|
||||
///
|
||||
/// @note: The call will block until all commands have executed.
|
||||
///
|
||||
/// @param agent Agent passed to Initialize.
|
||||
///
|
||||
/// @return hsa_status_t
|
||||
virtual hsa_status_t Destroy() = 0;
|
||||
virtual hsa_status_t Destroy(const core::Agent& agent) = 0;
|
||||
|
||||
/// @brief Submit a linear copy command to the the underlying compute device's
|
||||
/// control block. The call is blocking until the command execution is
|
||||
|
||||
@@ -43,40 +43,486 @@
|
||||
#include "core/inc/amd_blit_kernel.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
|
||||
#include "core/inc/amd_blit_kernel_kv.h"
|
||||
#include "core/inc/amd_blit_kernel_vi.h"
|
||||
#include "core/inc/amd_gpu_agent.h"
|
||||
#include "core/inc/hsa_internal.h"
|
||||
#include "core/util/utils.h"
|
||||
|
||||
namespace amd {
|
||||
const uint32_t BlitKernel::kGroupSize = 256;
|
||||
const size_t BlitKernel::kMaxCopyCount = AlignDown(UINT32_MAX, kGroupSize);
|
||||
const size_t BlitKernel::kMaxFillCount = AlignDown(UINT32_MAX, kGroupSize);
|
||||
|
||||
static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID;
|
||||
|
||||
static std::string kBlitKernelSource(R"(
|
||||
// Compatibility function for GFXIP 7.
|
||||
|
||||
function s_load_dword_offset(byte_offset)
|
||||
if kGFXIPVersion == 7
|
||||
return byte_offset / 4
|
||||
else
|
||||
return byte_offset
|
||||
end
|
||||
end
|
||||
|
||||
// Memory copy for all cases except:
|
||||
// (src_addr & 0x3) != (dst_addr & 0x3)
|
||||
//
|
||||
// Kernel argument buffer:
|
||||
// [DW 0, 1] Phase 1 src start address
|
||||
// [DW 2, 3] Phase 1 dst start address
|
||||
// [DW 4, 5] Phase 2 src start address
|
||||
// [DW 6, 7] Phase 2 dst start address
|
||||
// [DW 8, 9] Phase 3 src start address
|
||||
// [DW 10,11] Phase 3 dst start address
|
||||
// [DW 12,13] Phase 4 src start address
|
||||
// [DW 14,15] Phase 4 dst start address
|
||||
// [DW 16,17] Phase 4 src end address
|
||||
// [DW 18,19] Phase 4 dst end address
|
||||
// [DW 20 ] Total number of workitems
|
||||
|
||||
var kCopyAlignedVecWidth = 4
|
||||
var kCopyAlignedUnroll = 1
|
||||
|
||||
shader CopyAligned
|
||||
type(CS)
|
||||
user_sgpr_count(2)
|
||||
sgpr_count(32)
|
||||
vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))
|
||||
|
||||
// Retrieve kernel arguments.
|
||||
s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
|
||||
s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
|
||||
s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20)
|
||||
s_load_dwordx4 s[16:19], s[0:1], s_load_dword_offset(0x30)
|
||||
s_load_dwordx4 s[20:23], s[0:1], s_load_dword_offset(0x40)
|
||||
s_load_dword s24, s[0:1], s_load_dword_offset(0x50)
|
||||
s_waitcnt lgkmcnt(0)
|
||||
|
||||
// Compute workitem id.
|
||||
s_lshl_b32 s2, s2, 0x6
|
||||
v_add_u32 v0, vcc, s2, v0
|
||||
|
||||
// =====================================================
|
||||
// Phase 1: Byte copy up to 0x100 destination alignment.
|
||||
// =====================================================
|
||||
|
||||
// Compute phase source address.
|
||||
v_mov_b32 v3, s5
|
||||
v_add_u32 v2, vcc, v0, s4
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
// Compute phase destination address.
|
||||
v_mov_b32 v5, s7
|
||||
v_add_u32 v4, vcc, v0, s6
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
L_COPY_ALIGNED_PHASE_1_LOOP:
|
||||
// Mask off lanes (or branch out) after phase end.
|
||||
v_cmp_lt_u64 vcc, v[2:3], s[8:9]
|
||||
s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE
|
||||
s_and_b64 exec, exec, vcc
|
||||
|
||||
// Load from/advance the source address.
|
||||
flat_load_ubyte v1, v[2:3]
|
||||
s_waitcnt vmcnt(0)
|
||||
v_add_u32 v2, vcc, v2, s24
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
// Write to/advance the destination address.
|
||||
flat_store_byte v[4:5], v1
|
||||
v_add_u32 v4, vcc, v4, s24
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
// Repeat until branched out.
|
||||
s_branch L_COPY_ALIGNED_PHASE_1_LOOP
|
||||
|
||||
L_COPY_ALIGNED_PHASE_1_DONE:
|
||||
// Restore EXEC mask for all lanes.
|
||||
s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF
|
||||
|
||||
// ========================================================
|
||||
// Phase 2: Unrolled dword[x4] copy up to last whole block.
|
||||
// ========================================================
|
||||
|
||||
// Compute unrolled dword[x4] stride across all threads.
|
||||
if kCopyAlignedVecWidth == 4
|
||||
s_lshl_b32 s25, s24, 0x4
|
||||
else
|
||||
s_lshl_b32 s25, s24, 0x2
|
||||
end
|
||||
|
||||
// Compute phase source address.
|
||||
if kCopyAlignedVecWidth == 4
|
||||
v_lshlrev_b32 v1, 0x4, v0
|
||||
else
|
||||
v_lshlrev_b32 v1, 0x2, v0
|
||||
end
|
||||
|
||||
v_mov_b32 v3, s9
|
||||
v_add_u32 v2, vcc, v1, s8
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
// Compute phase destination address.
|
||||
v_mov_b32 v5, s11
|
||||
v_add_u32 v4, vcc, v1, s10
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
L_COPY_ALIGNED_PHASE_2_LOOP:
|
||||
// Branch out after phase end.
|
||||
v_cmp_lt_u64 vcc, v[2:3], s[12:13]
|
||||
s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE
|
||||
|
||||
// Load from/advance the source address.
|
||||
for var i = 0; i < kCopyAlignedUnroll; i ++
|
||||
if kCopyAlignedVecWidth == 4
|
||||
flat_load_dwordx4 v[8 + (i * 4)], v[2:3]
|
||||
else
|
||||
flat_load_dword v[8 + i], v[2:3]
|
||||
end
|
||||
|
||||
v_add_u32 v2, vcc, v2, s25
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
end
|
||||
|
||||
// Write to/advance the destination address.
|
||||
s_waitcnt vmcnt(0)
|
||||
|
||||
for var i = 0; i < kCopyAlignedUnroll; i ++
|
||||
if kCopyAlignedVecWidth == 4
|
||||
flat_store_dwordx4 v[4:5], v[8 + (i * 4)]
|
||||
else
|
||||
flat_store_dword v[4:5], v[8 + i]
|
||||
end
|
||||
|
||||
v_add_u32 v4, vcc, v4, s25
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
end
|
||||
|
||||
// Repeat until branched out.
|
||||
s_branch L_COPY_ALIGNED_PHASE_2_LOOP
|
||||
|
||||
L_COPY_ALIGNED_PHASE_2_DONE:
|
||||
|
||||
// ===========================================
|
||||
// Phase 3: Dword copy up to last whole dword.
|
||||
// ===========================================
|
||||
|
||||
// Compute dword stride across all threads.
|
||||
s_lshl_b32 s25, s24, 0x2
|
||||
|
||||
// Compute phase source address.
|
||||
v_lshlrev_b32 v1, 0x2, v0
|
||||
v_mov_b32 v3, s13
|
||||
v_add_u32 v2, vcc, v1, s12
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
// Compute phase destination address.
|
||||
v_mov_b32 v5, s15
|
||||
v_add_u32 v4, vcc, v1, s14
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
L_COPY_ALIGNED_PHASE_3_LOOP:
|
||||
// Mask off lanes (or branch out) after phase end.
|
||||
v_cmp_lt_u64 vcc, v[2:3], s[16:17]
|
||||
s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE
|
||||
s_and_b64 exec, exec, vcc
|
||||
|
||||
// Load from/advance the source address.
|
||||
flat_load_dword v1, v[2:3]
|
||||
v_add_u32 v2, vcc, v2, s25
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
s_waitcnt vmcnt(0)
|
||||
|
||||
// Write to/advance the destination address.
|
||||
flat_store_dword v[4:5], v1
|
||||
v_add_u32 v4, vcc, v4, s25
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
// Repeat until branched out.
|
||||
s_branch L_COPY_ALIGNED_PHASE_3_LOOP
|
||||
|
||||
L_COPY_ALIGNED_PHASE_3_DONE:
|
||||
// Restore EXEC mask for all lanes.
|
||||
s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF
|
||||
|
||||
// =============================
|
||||
// Phase 4: Byte copy up to end.
|
||||
// =============================
|
||||
|
||||
// Compute phase source address.
|
||||
v_mov_b32 v3, s17
|
||||
v_add_u32 v2, vcc, v0, s16
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
// Compute phase destination address.
|
||||
v_mov_b32 v5, s19
|
||||
v_add_u32 v4, vcc, v0, s18
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
// Mask off lanes (or branch out) after phase end.
|
||||
v_cmp_lt_u64 vcc, v[2:3], s[20:21]
|
||||
s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE
|
||||
s_and_b64 exec, exec, vcc
|
||||
|
||||
// Load from the source address.
|
||||
flat_load_ubyte v1, v[2:3]
|
||||
s_waitcnt vmcnt(0)
|
||||
|
||||
// Write to the destination address.
|
||||
flat_store_byte v[4:5], v1
|
||||
|
||||
L_COPY_ALIGNED_PHASE_4_DONE:
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
// Memory copy for this case:
|
||||
// (src_addr & 0x3) != (dst_addr & 0x3)
|
||||
//
|
||||
// Kernel argument buffer:
|
||||
// [DW 0, 1] Phase 1 src start address
|
||||
// [DW 2, 3] Phase 1 dst start address
|
||||
// [DW 4, 5] Phase 2 src start address
|
||||
// [DW 6, 7] Phase 2 dst start address
|
||||
// [DW 8, 9] Phase 2 src end address
|
||||
// [DW 10,11] Phase 2 dst end address
|
||||
// [DW 12 ] Total number of workitems
|
||||
|
||||
var kCopyMisalignedUnroll = 4
|
||||
|
||||
shader CopyMisaligned
|
||||
type(CS)
|
||||
user_sgpr_count(2)
|
||||
sgpr_count(23)
|
||||
vgpr_count(6 + kCopyMisalignedUnroll)
|
||||
|
||||
// Retrieve kernel arguments.
|
||||
s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
|
||||
s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
|
||||
s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20)
|
||||
s_load_dword s16, s[0:1], s_load_dword_offset(0x30)
|
||||
s_waitcnt lgkmcnt(0)
|
||||
|
||||
// Compute workitem id.
|
||||
s_lshl_b32 s2, s2, 0x6
|
||||
v_add_u32 v0, vcc, s2, v0
|
||||
|
||||
// ===================================================
|
||||
// Phase 1: Unrolled byte copy up to last whole block.
|
||||
// ===================================================
|
||||
|
||||
// Compute phase source address.
|
||||
v_mov_b32 v3, s5
|
||||
v_add_u32 v2, vcc, v0, s4
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
// Compute phase destination address.
|
||||
v_mov_b32 v5, s7
|
||||
v_add_u32 v4, vcc, v0, s6
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
L_COPY_MISALIGNED_PHASE_1_LOOP:
|
||||
// Branch out after phase end.
|
||||
v_cmp_lt_u64 vcc, v[2:3], s[8:9]
|
||||
s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE
|
||||
|
||||
// Load from/advance the source address.
|
||||
for var i = 0; i < kCopyMisalignedUnroll; i ++
|
||||
flat_load_ubyte v[6 + i], v[2:3]
|
||||
v_add_u32 v2, vcc, v2, s16
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
end
|
||||
|
||||
// Write to/advance the destination address.
|
||||
s_waitcnt vmcnt(0)
|
||||
|
||||
for var i = 0; i < kCopyMisalignedUnroll; i ++
|
||||
flat_store_byte v[4:5], v[6 + i]
|
||||
v_add_u32 v4, vcc, v4, s16
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
end
|
||||
|
||||
// Repeat until branched out.
|
||||
s_branch L_COPY_MISALIGNED_PHASE_1_LOOP
|
||||
|
||||
L_COPY_MISALIGNED_PHASE_1_DONE:
|
||||
|
||||
// =============================
|
||||
// Phase 2: Byte copy up to end.
|
||||
// =============================
|
||||
|
||||
// Compute phase source address.
|
||||
v_mov_b32 v3, s9
|
||||
v_add_u32 v2, vcc, v0, s8
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
// Compute phase destination address.
|
||||
v_mov_b32 v5, s11
|
||||
v_add_u32 v4, vcc, v0, s10
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
L_COPY_MISALIGNED_PHASE_2_LOOP:
|
||||
// Mask off lanes (or branch out) after phase end.
|
||||
v_cmp_lt_u64 vcc, v[2:3], s[12:13]
|
||||
s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE
|
||||
s_and_b64 exec, exec, vcc
|
||||
|
||||
// Load from/advance the source address.
|
||||
flat_load_ubyte v1, v[2:3]
|
||||
v_add_u32 v2, vcc, v2, s16
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
s_waitcnt vmcnt(0)
|
||||
|
||||
// Write to/advance the destination address.
|
||||
flat_store_byte v[4:5], v1
|
||||
v_add_u32 v4, vcc, v4, s16
|
||||
v_addc_u32 v5, vcc, v5, 0x0, vcc
|
||||
|
||||
// Repeat until branched out.
|
||||
s_branch L_COPY_MISALIGNED_PHASE_2_LOOP
|
||||
|
||||
L_COPY_MISALIGNED_PHASE_2_DONE:
|
||||
s_endpgm
|
||||
end
|
||||
|
||||
// Memory fill for dword-aligned region.
|
||||
//
|
||||
// Kernel argument buffer:
|
||||
// [DW 0, 1] Phase 1 dst start address
|
||||
// [DW 2, 3] Phase 2 dst start address
|
||||
// [DW 4, 5] Phase 2 dst end address
|
||||
// [DW 6 ] Value to fill memory with
|
||||
// [DW 7 ] Total number of workitems
|
||||
|
||||
var kFillVecWidth = 4
|
||||
var kFillUnroll = 1
|
||||
|
||||
shader Fill
|
||||
type(CS)
|
||||
user_sgpr_count(2)
|
||||
sgpr_count(19)
|
||||
vgpr_count(8)
|
||||
|
||||
// Retrieve kernel arguments.
|
||||
s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
|
||||
s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
|
||||
s_waitcnt lgkmcnt(0)
|
||||
|
||||
// Compute workitem id.
|
||||
s_lshl_b32 s2, s2, 0x6
|
||||
v_add_u32 v0, vcc, s2, v0
|
||||
|
||||
// Copy fill pattern into VGPRs.
|
||||
for var i = 0; i < kFillVecWidth; i ++
|
||||
v_mov_b32 v[4 + i], s10
|
||||
end
|
||||
|
||||
// ========================================================
|
||||
// Phase 1: Unrolled dword[x4] fill up to last whole block.
|
||||
// ========================================================
|
||||
|
||||
// Compute unrolled dword[x4] stride across all threads.
|
||||
if kFillVecWidth == 4
|
||||
s_lshl_b32 s12, s11, 0x4
|
||||
else
|
||||
s_lshl_b32 s12, s11, 0x2
|
||||
end
|
||||
|
||||
// Compute phase destination address.
|
||||
if kFillVecWidth == 4
|
||||
v_lshlrev_b32 v1, 0x4, v0
|
||||
else
|
||||
v_lshlrev_b32 v1, 0x2, v0
|
||||
end
|
||||
|
||||
v_mov_b32 v3, s5
|
||||
v_add_u32 v2, vcc, v1, s4
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
L_FILL_PHASE_1_LOOP:
|
||||
// Branch out after phase end.
|
||||
v_cmp_lt_u64 vcc, v[2:3], s[6:7]
|
||||
s_cbranch_vccz L_FILL_PHASE_1_DONE
|
||||
|
||||
// Write to/advance the destination address.
|
||||
for var i = 0; i < kFillUnroll; i ++
|
||||
if kFillVecWidth == 4
|
||||
flat_store_dwordx4 v[2:3], v[4:7]
|
||||
else
|
||||
flat_store_dword v[2:3], v4
|
||||
end
|
||||
|
||||
v_add_u32 v2, vcc, v2, s12
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
end
|
||||
|
||||
// Repeat until branched out.
|
||||
s_branch L_FILL_PHASE_1_LOOP
|
||||
|
||||
L_FILL_PHASE_1_DONE:
|
||||
|
||||
// ==============================
|
||||
// Phase 2: Dword fill up to end.
|
||||
// ==============================
|
||||
|
||||
// Compute dword stride across all threads.
|
||||
s_lshl_b32 s12, s11, 0x2
|
||||
|
||||
// Compute phase destination address.
|
||||
v_lshlrev_b32 v1, 0x2, v0
|
||||
v_mov_b32 v3, s7
|
||||
v_add_u32 v2, vcc, v1, s6
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
L_FILL_PHASE_2_LOOP:
|
||||
// Mask off lanes (or branch out) after phase end.
|
||||
v_cmp_lt_u64 vcc, v[2:3], s[8:9]
|
||||
s_cbranch_vccz L_FILL_PHASE_2_DONE
|
||||
s_and_b64 exec, exec, vcc
|
||||
|
||||
// Write to/advance the destination address.
|
||||
flat_store_dword v[2:3], v4
|
||||
v_add_u32 v2, vcc, v2, s12
|
||||
v_addc_u32 v3, vcc, v3, 0x0, vcc
|
||||
|
||||
// Repeat until branched out.
|
||||
s_branch L_FILL_PHASE_2_LOOP
|
||||
|
||||
L_FILL_PHASE_2_DONE:
|
||||
s_endpgm
|
||||
end
|
||||
)");
|
||||
|
||||
// Search kernel source for variable definition and return value.
|
||||
int GetKernelSourceParam(const char* paramName) {
|
||||
std::stringstream paramDef;
|
||||
paramDef << "var " << paramName << " = ";
|
||||
|
||||
std::string::size_type paramDefLoc = kBlitKernelSource.find(paramDef.str());
|
||||
assert(paramDefLoc != std::string::npos);
|
||||
std::string::size_type paramValLoc = paramDefLoc + paramDef.str().size();
|
||||
std::string::size_type paramEndLoc =
|
||||
kBlitKernelSource.find('\n', paramDefLoc);
|
||||
assert(paramDefLoc != std::string::npos);
|
||||
|
||||
std::string paramVal(&kBlitKernelSource[paramValLoc],
|
||||
&kBlitKernelSource[paramEndLoc]);
|
||||
return std::stoi(paramVal);
|
||||
}
|
||||
|
||||
static int kCopyAlignedVecWidth = GetKernelSourceParam("kCopyAlignedVecWidth");
|
||||
static int kCopyAlignedUnroll = GetKernelSourceParam("kCopyAlignedUnroll");
|
||||
static int kCopyMisalignedUnroll = GetKernelSourceParam("kCopyMisalignedUnroll");
|
||||
static int kFillVecWidth = GetKernelSourceParam("kFillVecWidth");
|
||||
static int kFillUnroll = GetKernelSourceParam("kFillUnroll");
|
||||
|
||||
BlitKernel::BlitKernel()
|
||||
: core::Blit(),
|
||||
copy_code_handle_(0),
|
||||
fill_code_handle_(0),
|
||||
queue_(NULL),
|
||||
cached_index_(0),
|
||||
kernarg_async_(NULL),
|
||||
kernarg_async_mask_(0),
|
||||
kernarg_async_counter_(0),
|
||||
code_arg_buffer_(NULL) {
|
||||
num_cus_(0) {
|
||||
completion_signal_.handle = 0;
|
||||
}
|
||||
|
||||
@@ -96,26 +542,8 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// Need queue buffer that can cover the max size of local memory.
|
||||
const uint64_t kGpuVmVaSize = 1ULL << 40;
|
||||
const uint32_t kRequiredQueueSize = NextPow2(static_cast<uint32_t>(
|
||||
std::ceil(static_cast<double>(kGpuVmVaSize) / kMaxCopyCount)));
|
||||
|
||||
uint32_t max_queue_size = 0;
|
||||
status = HSA::hsa_agent_get_info(agent_handle, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
|
||||
&max_queue_size);
|
||||
|
||||
if (HSA_STATUS_SUCCESS != status) {
|
||||
return status;
|
||||
}
|
||||
|
||||
if (max_queue_size < kRequiredQueueSize) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
status =
|
||||
HSA::hsa_queue_create(agent_handle, kRequiredQueueSize,
|
||||
HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue_);
|
||||
status = HSA::hsa_queue_create(agent_handle, 1024, HSA_QUEUE_TYPE_MULTI, NULL,
|
||||
NULL, 0, 0, &queue_);
|
||||
|
||||
if (HSA_STATUS_SUCCESS != status) {
|
||||
return status;
|
||||
@@ -125,86 +553,6 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
|
||||
|
||||
cached_index_ = 0;
|
||||
|
||||
void* copy_raw_obj_mem = NULL;
|
||||
size_t copy_akc_size = 0;
|
||||
size_t copy_akc_offset = 0;
|
||||
|
||||
void* copy_aligned_raw_obj_mem = NULL;
|
||||
size_t copy_aligned_akc_size = 0;
|
||||
size_t copy_aligned_akc_offset = 0;
|
||||
|
||||
void* fill_raw_obj_mem = NULL;
|
||||
size_t fill_akc_size = 0;
|
||||
size_t fill_akc_offset = 0;
|
||||
|
||||
switch (agent.isa()->GetMajorVersion()) {
|
||||
case 7:
|
||||
copy_raw_obj_mem = kVectorCopyKvObject;
|
||||
copy_akc_size = HSA_VECTOR_COPY_KV_AKC_SIZE;
|
||||
copy_akc_offset = HSA_VECTOR_COPY_KV_AKC_OFFSET;
|
||||
|
||||
copy_aligned_raw_obj_mem = kVectorCopyAlignedKvObject;
|
||||
copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE;
|
||||
copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET;
|
||||
|
||||
fill_raw_obj_mem = kFillMemoryKvObject;
|
||||
fill_akc_size = HSA_FILL_MEMORY_KV_AKC_SIZE;
|
||||
fill_akc_offset = HSA_FILL_MEMORY_KV_AKC_OFFSET;
|
||||
break;
|
||||
case 8:
|
||||
copy_raw_obj_mem = kVectorCopyViObject;
|
||||
copy_akc_size = HSA_VECTOR_COPY_VI_AKC_SIZE;
|
||||
copy_akc_offset = HSA_VECTOR_COPY_VI_AKC_OFFSET;
|
||||
|
||||
copy_aligned_raw_obj_mem = kVectorCopyAlignedViObject;
|
||||
copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE;
|
||||
copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET;
|
||||
|
||||
fill_raw_obj_mem = kFillMemoryViObject;
|
||||
fill_akc_size = HSA_FILL_MEMORY_VI_AKC_SIZE;
|
||||
fill_akc_offset = HSA_FILL_MEMORY_VI_AKC_OFFSET;
|
||||
break;
|
||||
default:
|
||||
assert(false && "Only gfx7 and gfx8 are supported");
|
||||
break;
|
||||
}
|
||||
|
||||
const size_t total_alloc_size = AlignUp(
|
||||
AlignUp(copy_akc_size, 256) + AlignUp(copy_aligned_akc_size, 256) +
|
||||
AlignUp(fill_akc_size, 256),
|
||||
4096);
|
||||
|
||||
amd_kernel_code_t *code_ptr = nullptr;
|
||||
code_arg_buffer_ = core::Runtime::runtime_singleton_->system_allocator()(
|
||||
total_alloc_size, 4096);
|
||||
|
||||
char* akc_arg = reinterpret_cast<char*>(code_arg_buffer_);
|
||||
memcpy(akc_arg,
|
||||
reinterpret_cast<const char*>(copy_raw_obj_mem) + copy_akc_offset,
|
||||
copy_akc_size);
|
||||
copy_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
|
||||
code_ptr = (amd_kernel_code_t*)(copy_code_handle_);
|
||||
code_ptr->runtime_loader_kernel_symbol = 0;
|
||||
akc_arg += copy_akc_size;
|
||||
|
||||
akc_arg = AlignUp(akc_arg, 256);
|
||||
memcpy(akc_arg, reinterpret_cast<const char*>(copy_aligned_raw_obj_mem) +
|
||||
copy_aligned_akc_offset,
|
||||
copy_aligned_akc_size);
|
||||
copy_aligned_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
|
||||
code_ptr = (amd_kernel_code_t*)(copy_aligned_code_handle_);
|
||||
code_ptr->runtime_loader_kernel_symbol = 0;
|
||||
akc_arg += copy_aligned_akc_size;
|
||||
|
||||
akc_arg = AlignUp(akc_arg, 256);
|
||||
memcpy(akc_arg,
|
||||
reinterpret_cast<const char*>(fill_raw_obj_mem) + fill_akc_offset,
|
||||
fill_akc_size);
|
||||
fill_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
|
||||
code_ptr = (amd_kernel_code_t*)(fill_code_handle_);
|
||||
code_ptr->runtime_loader_kernel_symbol = 0;
|
||||
akc_arg += fill_akc_size;
|
||||
|
||||
status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_);
|
||||
if (HSA_STATUS_SUCCESS != status) {
|
||||
return status;
|
||||
@@ -212,34 +560,40 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
|
||||
|
||||
kernarg_async_ = reinterpret_cast<KernelArgs*>(
|
||||
core::Runtime::runtime_singleton_->system_allocator()(
|
||||
kRequiredQueueSize * AlignUp(sizeof(KernelArgs), 16), 16));
|
||||
queue_->size * AlignUp(sizeof(KernelArgs), 16), 16));
|
||||
|
||||
kernarg_async_mask_ = kRequiredQueueSize - 1;
|
||||
kernarg_async_mask_ = queue_->size - 1;
|
||||
|
||||
// TODO: remove this code when execute permission level is not mandatory.
|
||||
if (((amd::GpuAgent&)agent).profile() == HSA_PROFILE_FULL) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define NOMINMAX
|
||||
DWORD old_protect = 0;
|
||||
const DWORD new_protect = PAGE_EXECUTE_READWRITE;
|
||||
if (!VirtualProtect(code_arg_buffer_, total_alloc_size, new_protect,
|
||||
&old_protect)) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
#else
|
||||
if (0 != mprotect(code_arg_buffer_, total_alloc_size,
|
||||
PROT_READ | PROT_WRITE | PROT_EXEC)) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
#endif
|
||||
// Obtain the number of compute units in the underlying agent.
|
||||
const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);
|
||||
num_cus_ = gpuAgent.properties().NumFComputeCores / 4;
|
||||
|
||||
// Assemble shaders to AQL code objects.
|
||||
std::map<KernelType, const char*> kernel_names = {
|
||||
{KernelType::CopyAligned, "CopyAligned"},
|
||||
{KernelType::CopyMisaligned, "CopyMisaligned"},
|
||||
{KernelType::Fill, "Fill"}};
|
||||
|
||||
for (auto kernel_name : kernel_names) {
|
||||
KernelCode& kernel = kernels_[kernel_name.first];
|
||||
gpuAgent.AssembleShader(kBlitKernelSource.c_str(), kernel_name.second,
|
||||
GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
|
||||
kernel.code_buf_size_);
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t BlitKernel::Destroy(void) {
|
||||
hsa_status_t BlitKernel::Destroy(const core::Agent& agent) {
|
||||
std::lock_guard<std::mutex> guard(lock_);
|
||||
|
||||
const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);
|
||||
|
||||
for (auto kernel_pair : kernels_) {
|
||||
gpuAgent.ReleaseShader(kernel_pair.second.code_buf_,
|
||||
kernel_pair.second.code_buf_size_);
|
||||
}
|
||||
|
||||
if (queue_ != NULL) {
|
||||
HSA::hsa_queue_destroy(queue_);
|
||||
}
|
||||
@@ -248,10 +602,6 @@ hsa_status_t BlitKernel::Destroy(void) {
|
||||
core::Runtime::runtime_singleton_->system_deallocator()(kernarg_async_);
|
||||
}
|
||||
|
||||
if (code_arg_buffer_ != NULL) {
|
||||
core::Runtime::runtime_singleton_->system_deallocator()(code_arg_buffer_);
|
||||
}
|
||||
|
||||
if (completion_signal_.handle != 0) {
|
||||
HSA::hsa_signal_destroy(completion_signal_);
|
||||
}
|
||||
@@ -259,11 +609,6 @@ hsa_status_t BlitKernel::Destroy(void) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static bool IsSystemMemory(void* address) {
|
||||
static const uint64_t kLimitSystem = 1ULL << 48;
|
||||
return (reinterpret_cast<uint64_t>(address) < kLimitSystem);
|
||||
}
|
||||
|
||||
hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
|
||||
size_t size) {
|
||||
// Protect completion_signal_.
|
||||
@@ -294,35 +639,14 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
|
||||
hsa_status_t BlitKernel::SubmitLinearCopyCommand(
|
||||
void* dst, const void* src, size_t size,
|
||||
std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
|
||||
assert(copy_code_handle_ != 0);
|
||||
|
||||
const size_t kAlignmentChar = 1;
|
||||
const size_t kAlignmentUin32 = 4;
|
||||
const size_t kAlignmentVec4 = 16;
|
||||
const size_t copy_granule =
|
||||
(IsMultipleOf(dst, kAlignmentVec4) && IsMultipleOf(src, kAlignmentVec4) &&
|
||||
IsMultipleOf(size, kAlignmentVec4))
|
||||
? kAlignmentVec4
|
||||
: (IsMultipleOf(dst, kAlignmentUin32) &&
|
||||
IsMultipleOf(src, kAlignmentUin32) &&
|
||||
IsMultipleOf(size, kAlignmentUin32))
|
||||
? kAlignmentUin32
|
||||
: kAlignmentChar;
|
||||
|
||||
size = size / copy_granule;
|
||||
|
||||
const uint32_t num_copy_packet = static_cast<uint32_t>(
|
||||
std::ceil(static_cast<double>(size) / kMaxCopyCount));
|
||||
|
||||
const uint32_t num_barrier_packet =
|
||||
static_cast<uint32_t>(std::ceil(dep_signals.size() / 5.0f));
|
||||
|
||||
// Reserve write index for copy + fence packet.
|
||||
const uint32_t total_num_packet = num_barrier_packet + num_copy_packet;
|
||||
// Reserve write index for barrier(s) + dispatch packet.
|
||||
const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5);
|
||||
const uint32_t total_num_packet = num_barrier_packet + 1;
|
||||
|
||||
uint64_t write_index = AcquireWriteIndex(total_num_packet);
|
||||
uint64_t write_index_temp = write_index;
|
||||
|
||||
// Insert barrier packets to handle dependent signals.
|
||||
const uint16_t kBarrierPacketHeader =
|
||||
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
@@ -352,99 +676,116 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t last_copy_index = num_copy_packet - 1;
|
||||
size_t total_copy_count = 0;
|
||||
for (uint32_t i = 0; i < num_copy_packet; ++i) {
|
||||
// Setup arguments.
|
||||
const uint32_t copy_count = static_cast<uint32_t>(
|
||||
std::min((size - total_copy_count), kMaxCopyCount));
|
||||
// Insert dispatch packet for copy kernel.
|
||||
KernelArgs* args = ObtainAsyncKernelCopyArg();
|
||||
KernelCode* kernel_code = nullptr;
|
||||
int num_workitems = 0;
|
||||
|
||||
void* cur_dst = static_cast<char*>(dst) + (total_copy_count * copy_granule);
|
||||
const void* cur_src =
|
||||
static_cast<const char*>(src) + (total_copy_count * copy_granule);
|
||||
bool aligned = ((uintptr_t(src) & 0x3) == (uintptr_t(dst) & 0x3));
|
||||
|
||||
KernelArgs* args = ObtainAsyncKernelCopyArg();
|
||||
assert(args != NULL);
|
||||
assert(IsMultipleOf(&args->copy, 16));
|
||||
if (aligned) {
|
||||
// Use dword-based aligned kernel.
|
||||
kernel_code = &kernels_[KernelType::CopyAligned];
|
||||
|
||||
args->copy.src = cur_src;
|
||||
args->copy.dst = cur_dst;
|
||||
args->copy.size = copy_count;
|
||||
args->copy.use_vector = (copy_granule == kAlignmentVec4) ? 1 : 0;
|
||||
// Compute the size of each copy phase.
|
||||
num_workitems = 64 * 4 * num_cus_;
|
||||
|
||||
const uint32_t grid_size_x =
|
||||
AlignUp(static_cast<uint32_t>(copy_count), kGroupSize);
|
||||
// Phase 1 (byte copy) ends when destination is 0x100-aligned.
|
||||
uintptr_t src_start = uintptr_t(src);
|
||||
uintptr_t dst_start = uintptr_t(dst);
|
||||
uint64_t phase1_size =
|
||||
std::min(size, uint64_t(0x100 - (dst_start & 0xFF)) & 0xFF);
|
||||
|
||||
// This assert to make sure kMaxCopySize is not changed to a number that
|
||||
// could cause overflow to packet.grid_size_x.
|
||||
assert(grid_size_x >= copy_count);
|
||||
// Phase 2 (unrolled dwordx4 copy) ends when last whole block fits.
|
||||
uint64_t phase2_block = num_workitems * sizeof(uint32_t) *
|
||||
kCopyAlignedUnroll * kCopyAlignedVecWidth;
|
||||
uint64_t phase2_size = ((size - phase1_size) / phase2_block) * phase2_block;
|
||||
|
||||
hsa_signal_t signal = {(i == last_copy_index)
|
||||
? (core::Signal::Convert(&out_signal)).handle
|
||||
: 0};
|
||||
PopulateQueue(write_index, ((copy_granule == kAlignmentChar)
|
||||
? copy_code_handle_
|
||||
: copy_aligned_code_handle_),
|
||||
args, grid_size_x, signal);
|
||||
// Phase 3 (dword copy) ends when last whole dword fits.
|
||||
uint64_t phase3_size =
|
||||
((size - phase1_size - phase2_size) / sizeof(uint32_t)) *
|
||||
sizeof(uint32_t);
|
||||
|
||||
++write_index;
|
||||
args->copy_aligned.phase1_src_start = src_start;
|
||||
args->copy_aligned.phase1_dst_start = dst_start;
|
||||
args->copy_aligned.phase2_src_start = src_start + phase1_size;
|
||||
args->copy_aligned.phase2_dst_start = dst_start + phase1_size;
|
||||
args->copy_aligned.phase3_src_start = src_start + phase1_size + phase2_size;
|
||||
args->copy_aligned.phase3_dst_start = dst_start + phase1_size + phase2_size;
|
||||
args->copy_aligned.phase4_src_start =
|
||||
src_start + phase1_size + phase2_size + phase3_size;
|
||||
args->copy_aligned.phase4_dst_start =
|
||||
dst_start + phase1_size + phase2_size + phase3_size;
|
||||
args->copy_aligned.phase4_src_end = src_start + size;
|
||||
args->copy_aligned.phase4_dst_end = dst_start + size;
|
||||
args->copy_aligned.num_workitems = num_workitems;
|
||||
} else {
|
||||
// Use byte-based misaligned kernel.
|
||||
kernel_code = &kernels_[KernelType::CopyMisaligned];
|
||||
|
||||
total_copy_count += copy_count;
|
||||
// Compute the size of each copy phase.
|
||||
num_workitems = 64 * 4 * num_cus_;
|
||||
|
||||
// Phase 1 (unrolled byte copy) ends when last whole block fits.
|
||||
uintptr_t src_start = uintptr_t(src);
|
||||
uintptr_t dst_start = uintptr_t(dst);
|
||||
uint64_t phase1_block =
|
||||
num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll;
|
||||
uint64_t phase1_size = (size / phase1_block) * phase1_block;
|
||||
|
||||
args->copy_misaligned.phase1_src_start = src_start;
|
||||
args->copy_misaligned.phase1_dst_start = dst_start;
|
||||
args->copy_misaligned.phase2_src_start = src_start + phase1_size;
|
||||
args->copy_misaligned.phase2_dst_start = dst_start + phase1_size;
|
||||
args->copy_misaligned.phase2_src_end = src_start + size;
|
||||
args->copy_misaligned.phase2_dst_end = dst_start + size;
|
||||
args->copy_misaligned.num_workitems = num_workitems;
|
||||
}
|
||||
|
||||
// Launch copy packet.
|
||||
hsa_signal_t signal = {(core::Signal::Convert(&out_signal)).handle};
|
||||
PopulateQueue(write_index, uintptr_t(kernel_code->code_buf_), args,
|
||||
num_workitems, signal);
|
||||
|
||||
// Submit barrier(s) and dispatch packets.
|
||||
ReleaseWriteIndex(write_index_temp, total_num_packet);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value,
|
||||
size_t num) {
|
||||
assert(fill_code_handle_ != 0);
|
||||
|
||||
size_t count) {
|
||||
std::lock_guard<std::mutex> guard(lock_);
|
||||
|
||||
HSA::hsa_signal_store_relaxed(completion_signal_, 1);
|
||||
|
||||
const uint32_t num_fill_packet = static_cast<uint32_t>(
|
||||
std::ceil(static_cast<double>(num) / kMaxFillCount));
|
||||
|
||||
// Reserve write index for copy + fence packet.
|
||||
uint64_t write_index = AcquireWriteIndex(num_fill_packet);
|
||||
|
||||
const uint32_t last_fill_index = num_fill_packet - 1;
|
||||
size_t total_fill_count = 0;
|
||||
for (uint32_t i = 0; i < num_fill_packet; ++i) {
|
||||
// Setup arguments.
|
||||
const uint32_t fill_count = static_cast<uint32_t>(
|
||||
std::min((num - total_fill_count), kMaxFillCount));
|
||||
void* cur_ptr = static_cast<char*>(ptr) + total_fill_count;
|
||||
|
||||
KernelArgs* args = ObtainAsyncKernelCopyArg();
|
||||
assert(args != NULL);
|
||||
assert(IsMultipleOf(&args->fill, 16));
|
||||
|
||||
args->fill.ptr = cur_ptr;
|
||||
args->fill.num = fill_count;
|
||||
args->fill.value = value;
|
||||
|
||||
const uint32_t grid_size_x =
|
||||
AlignUp(static_cast<uint32_t>(fill_count), kGroupSize);
|
||||
|
||||
// This assert to make sure kMaxFillCount is not changed to a number that
|
||||
// could cause overflow to packet.grid_size_x.
|
||||
assert(grid_size_x >= fill_count);
|
||||
|
||||
hsa_signal_t signal = {(i == last_fill_index) ? completion_signal_.handle
|
||||
: 0};
|
||||
PopulateQueue(write_index + i, fill_code_handle_, &args[i], grid_size_x,
|
||||
signal);
|
||||
|
||||
total_fill_count += fill_count;
|
||||
// Reject misaligned base address.
|
||||
if ((uintptr_t(ptr) & 0x3) != 0) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// Launch fill packet.
|
||||
ReleaseWriteIndex(write_index, num_fill_packet);
|
||||
// Compute the size of each fill phase.
|
||||
int num_workitems = 64 * num_cus_;
|
||||
|
||||
// Phase 1 (unrolled dwordx4 copy) ends when last whole block fits.
|
||||
uintptr_t dst_start = uintptr_t(ptr);
|
||||
uint64_t fill_size = count * sizeof(uint32_t);
|
||||
|
||||
uint64_t phase1_block =
|
||||
num_workitems * sizeof(uint32_t) * kFillUnroll * kFillVecWidth;
|
||||
uint64_t phase1_size = (fill_size / phase1_block) * phase1_block;
|
||||
|
||||
KernelArgs* args = ObtainAsyncKernelCopyArg();
|
||||
args->fill.phase1_dst_start = dst_start;
|
||||
args->fill.phase2_dst_start = dst_start + phase1_size;
|
||||
args->fill.phase2_dst_end = dst_start + fill_size;
|
||||
args->fill.fill_value = value;
|
||||
args->fill.num_workitems = num_workitems;
|
||||
|
||||
// Submit dispatch packet.
|
||||
HSA::hsa_signal_store_relaxed(completion_signal_, 1);
|
||||
|
||||
uint64_t write_index = AcquireWriteIndex(1);
|
||||
PopulateQueue(write_index, uintptr_t(kernels_[KernelType::Fill].code_buf_),
|
||||
args, num_workitems, completion_signal_);
|
||||
ReleaseWriteIndex(write_index, 1);
|
||||
|
||||
// Wait for the packet to finish.
|
||||
if (HSA::hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_LT,
|
||||
@@ -556,9 +897,9 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
|
||||
// Setup working size.
|
||||
const int kNumDimension = 1;
|
||||
packet.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), kGroupSize);
|
||||
packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), 64);
|
||||
packet.grid_size_y = packet.grid_size_z = 1;
|
||||
packet.workgroup_size_x = kGroupSize;
|
||||
packet.workgroup_size_x = 64;
|
||||
packet.workgroup_size_y = packet.workgroup_size_z = 1;
|
||||
|
||||
packet.completion_signal = completion_signal;
|
||||
|
||||
@@ -404,7 +404,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
|
||||
|
||||
if (err != HSAKMT_STATUS_SUCCESS) {
|
||||
assert(false && "AQL queue memory map failure.");
|
||||
Destroy();
|
||||
Destroy(agent);
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
@@ -419,7 +419,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
|
||||
hsaKmtCreateQueue(gpu_agent.node_id(), kQueueType_, 100,
|
||||
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
|
||||
queue_size_, NULL, &queue_resource_)) {
|
||||
Destroy();
|
||||
Destroy(agent);
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
@@ -436,14 +436,14 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
|
||||
fence_pool_size_ * sizeof(uint32_t), 256));
|
||||
|
||||
if (fence_base_addr_ == NULL) {
|
||||
Destroy();
|
||||
Destroy(agent);
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t BlitSdma::Destroy(void) {
|
||||
hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
|
||||
// Release all allocated resources and reset them to zero.
|
||||
|
||||
if (queue_resource_.QueueId != 0) {
|
||||
|
||||
@@ -46,6 +46,7 @@
|
||||
#include <atomic>
|
||||
#include <cstring>
|
||||
#include <climits>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "core/inc/amd_aql_queue.h"
|
||||
@@ -126,7 +127,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
|
||||
|
||||
GpuAgent::~GpuAgent() {
|
||||
if (blit_h2d_ != NULL) {
|
||||
hsa_status_t status = blit_h2d_->Destroy();
|
||||
hsa_status_t status = blit_h2d_->Destroy(*this);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
delete blit_h2d_;
|
||||
@@ -134,7 +135,7 @@ GpuAgent::~GpuAgent() {
|
||||
}
|
||||
|
||||
if (blit_d2h_ != NULL) {
|
||||
hsa_status_t status = blit_d2h_->Destroy();
|
||||
hsa_status_t status = blit_d2h_->Destroy(*this);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
delete blit_d2h_;
|
||||
@@ -158,33 +159,51 @@ GpuAgent::~GpuAgent() {
|
||||
}
|
||||
|
||||
void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
|
||||
void*& code_buf, size_t& code_buf_size) {
|
||||
AssembleTarget assemble_target, void*& code_buf,
|
||||
size_t& code_buf_size) const {
|
||||
#ifdef __linux__ // No VS builds of libsp3 available right now
|
||||
std::string src_sp3_unified(src_sp3);
|
||||
|
||||
if (isa_->GetMajorVersion() == 7) {
|
||||
// On Gfx7 replace v_add_u32 with legacy equivalent v_add_i32.
|
||||
std::string add_inst_gfx8("v_add_u32"), add_inst_gfx7("v_add_i32");
|
||||
|
||||
for (size_t instIdx = 0; (instIdx = src_sp3_unified.find(
|
||||
add_inst_gfx8, instIdx)) != std::string::npos;
|
||||
instIdx += add_inst_gfx8.size()) {
|
||||
src_sp3_unified.replace(instIdx, add_inst_gfx7.size(), add_inst_gfx7);
|
||||
}
|
||||
}
|
||||
|
||||
// Assemble source string with libsp3.
|
||||
sp3_context* sp3 = sp3_new();
|
||||
|
||||
switch (isa_->GetMajorVersion()) {
|
||||
case 7:
|
||||
sp3_setasic(sp3, "CI");
|
||||
sp3_set_param_int(sp3, "kGFXIPVersion", 7);
|
||||
break;
|
||||
case 8:
|
||||
sp3_setasic(sp3, "VI");
|
||||
sp3_set_param_int(sp3, "kGFXIPVersion", 8);
|
||||
break;
|
||||
default:
|
||||
assert(false && "SP3 assembly not supported on this agent");
|
||||
}
|
||||
|
||||
sp3_parse_string(sp3, src_sp3);
|
||||
sp3_parse_string(sp3, src_sp3_unified.c_str());
|
||||
sp3_shader* code_sp3_meta = sp3_compile(sp3, func_name);
|
||||
|
||||
// Allocate a GPU-visible buffer for the trap shader.
|
||||
// Allocate a GPU-visible buffer for the shader.
|
||||
HsaMemFlags code_buf_flags = {0};
|
||||
code_buf_flags.ui32.HostAccess = 1;
|
||||
code_buf_flags.ui32.ExecuteAccess = 1;
|
||||
code_buf_flags.ui32.NoSubstitute = 1;
|
||||
|
||||
size_t code_size = code_sp3_meta->size * sizeof(uint32_t);
|
||||
code_buf_size = AlignUp(code_size, 0x1000);
|
||||
size_t header_size =
|
||||
(assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0);
|
||||
code_buf_size = AlignUp(header_size + code_size, 0x1000);
|
||||
|
||||
HSAKMT_STATUS err =
|
||||
hsaKmtAllocMemory(node_id(), code_buf_size, code_buf_flags, &code_buf);
|
||||
@@ -193,9 +212,38 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
|
||||
err = hsaKmtMapMemoryToGPU(code_buf, code_buf_size, NULL);
|
||||
assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(Trap) failed");
|
||||
|
||||
// Copy trap handler code into the GPU-visible buffer.
|
||||
memset(code_buf, 0, code_buf_size);
|
||||
memcpy(code_buf, code_sp3_meta->data, code_size);
|
||||
|
||||
// Populate optional code object header.
|
||||
if (assemble_target == AssembleTarget::AQL) {
|
||||
amd_kernel_code_t* header = reinterpret_cast<amd_kernel_code_t*>(code_buf);
|
||||
|
||||
int gran_sgprs = std::max(0, (int(code_sp3_meta->nsgprs) - 1) / 8);
|
||||
int gran_vgprs = std::max(0, (int(code_sp3_meta->nvgprs) - 1) / 4);
|
||||
|
||||
header->kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t);
|
||||
AMD_HSA_BITS_SET(header->kernel_code_properties,
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
|
||||
1);
|
||||
AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT,
|
||||
gran_sgprs);
|
||||
AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT,
|
||||
gran_vgprs);
|
||||
AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 3);
|
||||
AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 1);
|
||||
AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2);
|
||||
AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1);
|
||||
}
|
||||
|
||||
// Copy trap handler code into the GPU-visible buffer.
|
||||
memcpy((void*)(uintptr_t(code_buf) + header_size), code_sp3_meta->data,
|
||||
code_size);
|
||||
|
||||
// Release SP3 resources.
|
||||
sp3_free_shader(code_sp3_meta);
|
||||
@@ -203,7 +251,7 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
|
||||
#endif
|
||||
}
|
||||
|
||||
void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) {
|
||||
void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const {
|
||||
hsaKmtUnmapMemoryToGPU(code_buf);
|
||||
hsaKmtFreeMemory(code_buf, code_buf_size);
|
||||
}
|
||||
@@ -377,7 +425,7 @@ core::Blit* GpuAgent::CreateBlitSdma() {
|
||||
BlitSdma* sdma = new BlitSdma();
|
||||
|
||||
if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
|
||||
sdma->Destroy();
|
||||
sdma->Destroy(*this);
|
||||
delete sdma;
|
||||
sdma = NULL;
|
||||
}
|
||||
@@ -389,7 +437,7 @@ core::Blit* GpuAgent::CreateBlitKernel() {
|
||||
BlitKernel* kernl = new BlitKernel();
|
||||
|
||||
if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) {
|
||||
kernl->Destroy();
|
||||
kernl->Destroy(*this);
|
||||
delete kernl;
|
||||
kernl = NULL;
|
||||
}
|
||||
@@ -904,7 +952,8 @@ void GpuAgent::BindTrapHandler() {
|
||||
}
|
||||
|
||||
// Assemble the trap handler source code.
|
||||
AssembleShader(src_sp3, "TrapHandler", trap_code_buf_, trap_code_buf_size_);
|
||||
AssembleShader(src_sp3, "TrapHandler", AssembleTarget::ISA, trap_code_buf_,
|
||||
trap_code_buf_size_);
|
||||
|
||||
// Bind the trap handler to this node.
|
||||
HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_,
|
||||
|
||||
Ссылка в новой задаче
Block a user