2016-01-22 18:18:55 -05:00
|
|
|
//
|
|
|
|
|
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
|
//
|
|
|
|
|
#include "device/pal/palkernel.hpp"
|
|
|
|
|
#include "device/pal/palprogram.hpp"
|
|
|
|
|
#include "device/pal/palblit.hpp"
|
|
|
|
|
#include "device/pal/palconstbuf.hpp"
|
|
|
|
|
#include "device/pal/palsched.hpp"
|
|
|
|
|
#include "platform/commandqueue.hpp"
|
|
|
|
|
#include "utils/options.hpp"
|
|
|
|
|
#include "acl.h"
|
|
|
|
|
|
2018-12-10 15:49:24 -05:00
|
|
|
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
2018-10-02 11:52:23 -04:00
|
|
|
#include "llvm/Support/AMDGPUMetadata.h"
|
|
|
|
|
|
|
|
|
|
// Short alias for the per-kernel metadata record of the LLVM AMDGPU HSA metadata
using KernelMD = llvm::AMDGPU::HSAMD::Kernel::Metadata;
|
2018-12-10 15:49:24 -05:00
|
|
|
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
2018-10-02 11:52:23 -04:00
|
|
|
|
2016-01-22 18:18:55 -05:00
|
|
|
#include <string>
|
|
|
|
|
#include <memory>
|
|
|
|
|
#include <fstream>
|
|
|
|
|
#include <sstream>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <ctime>
|
|
|
|
|
#include <algorithm>
|
|
|
|
|
|
|
|
|
|
namespace pal {
|
|
|
|
|
|
2019-04-09 23:24:10 -04:00
|
|
|
void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
|
2019-05-08 19:22:02 -04:00
|
|
|
const uint32_t groupSegmentSize, const uint16_t numSGPRs,
|
2019-04-09 23:24:10 -04:00
|
|
|
const uint16_t numVGPRs) {
|
2019-06-14 18:36:26 -04:00
|
|
|
workGroupInfo_.scratchRegs_ = amd::alignUp(privateSegmentSize, 16) / sizeof(uint32_t);
|
|
|
|
|
// Make sure runtime matches HW alignment, which is 256 scratch regs (DWORDs) per wave
|
|
|
|
|
constexpr uint32_t ScratchRegAlignment = 256;
|
|
|
|
|
workGroupInfo_.scratchRegs_ =
|
|
|
|
|
amd::alignUp((workGroupInfo_.scratchRegs_ * dev().info().wavefrontWidth_),
|
|
|
|
|
ScratchRegAlignment) / dev().info().wavefrontWidth_;
|
|
|
|
|
workGroupInfo_.privateMemSize_ = workGroupInfo_.scratchRegs_ * sizeof(uint32_t);
|
2019-04-09 23:24:10 -04:00
|
|
|
workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
|
|
|
|
|
workGroupInfo_.usedSGPRs_ = numSGPRs;
|
2018-08-27 14:44:08 -04:00
|
|
|
workGroupInfo_.usedStackSize_ = 0;
|
2019-04-09 23:24:10 -04:00
|
|
|
workGroupInfo_.usedVGPRs_ = numVGPRs;
|
2018-08-27 14:44:08 -04:00
|
|
|
|
|
|
|
|
if (!prog().isNull()) {
|
|
|
|
|
workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
|
|
|
|
|
workGroupInfo_.availableSGPRs_ =
|
|
|
|
|
dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
|
|
|
|
|
workGroupInfo_.availableVGPRs_ =
|
|
|
|
|
dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
|
|
|
|
|
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ =
|
|
|
|
|
dev().info().wavefrontWidth_;
|
|
|
|
|
} else {
|
|
|
|
|
workGroupInfo_.availableLDSSize_ = 64 * Ki;
|
|
|
|
|
workGroupInfo_.availableSGPRs_ = 104;
|
|
|
|
|
workGroupInfo_.availableVGPRs_ = 256;
|
|
|
|
|
workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
|
2017-04-13 13:56:38 -04:00
|
|
|
}
|
2019-04-09 23:24:10 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Queries the kernel object address of the symbol into code_ and copies the
// amd_kernel_code_t located at the matching host address of the program into akc.
//   sym - loader symbol of the kernel
//   akc - destination for the kernel code header
// Returns false when sym or akc is null, when the kernel object can't be queried,
// or when the program reports no host address for the kernel object.
bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc) {
  if ((sym == nullptr) || (akc == nullptr)) {
    return false;
  }

  if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
    return false;
  }

  // Copy code object of this kernel from the program CPU segment.
  // memcpy() with a null source is undefined behavior, hence the explicit check.
  const void* hostCode = reinterpret_cast<void*>(prog().findHostKernelAddress(code_));
  if (hostCode == nullptr) {
    return false;
  }
  memcpy(akc, hostCode, sizeof(amd_kernel_code_t));

  return true;
}
|
|
|
|
|
|
|
|
|
|
// Reads the kernel code header and the code size for the symbol, then derives the
// workgroup info and the group/kernarg segment sizes from the header.
// Returns false if the header or the kernel object size can't be obtained.
bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
  // Fill akc_ (and code_) from the symbol first, everything below depends on it
  if (!setKernelCode(sym, &akc_)) {
    return false;
  }

  const bool haveCodeSize = sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
                                         reinterpret_cast<void*>(&codeSize_));
  if (!haveCodeSize) {
    return false;
  }

  // Setup the workgroup info
  const amd_kernel_code_t& header = akc_;
  setWorkGroupInfo(header.workitem_private_segment_byte_size,
                   header.workgroup_group_segment_byte_size, header.wavefront_sgpr_count,
                   header.workitem_vgpr_count);

  kernargSegmentByteSize_ = header.kernarg_segment_byte_size;
  workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;

  return true;
}
|
|
|
|
|
|
2017-04-13 13:56:38 -04:00
|
|
|
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
|
2019-04-09 23:24:10 -04:00
|
|
|
: device::Kernel(prog->dev(), name, *prog),
|
2017-04-13 13:56:38 -04:00
|
|
|
compileOptions_(compileOptions),
|
|
|
|
|
index_(0),
|
|
|
|
|
code_(0),
|
2019-04-09 23:24:10 -04:00
|
|
|
codeSize_(0),
|
|
|
|
|
workgroupGroupSegmentByteSize_(0),
|
2019-07-16 12:46:10 -05:00
|
|
|
kernargSegmentByteSize_(0) {
|
2018-08-28 17:30:29 -04:00
|
|
|
flags_.hsa_ = true;
|
2016-01-22 18:18:55 -05:00
|
|
|
}
|
|
|
|
|
|
2019-05-08 19:22:02 -04:00
|
|
|
// Nothing to release explicitly; members clean up after themselves
HSAILKernel::~HSAILKernel() = default;
|
2016-01-22 18:18:55 -05:00
|
|
|
|
2017-04-13 13:56:38 -04:00
|
|
|
// Initializes the kernel from the program binary (only when built WITH_COMPILER_LIB;
// otherwise this is a no-op that returns true).
//   sym      - loader symbol of the kernel, passed to aqlCreateHWInfo()
//   finalize - when true the kernel is first compiled from CG to ISA with aclCompile()
// After the optional finalization the function queries the compiler library for the
// argument list, work group sizes, printf info, device enqueue flag, kernel index,
// waves-per-SIMD hint, work group size hint and vector type hint.
// Returns false as soon as the compilation or any of the queries fails.
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
#if defined(WITH_COMPILER_LIB)
  acl_error error = ACL_SUCCESS;
  std::string openClKernelName = openclMangledName(name());
  flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos);

  // compile kernel down to ISA
  if (finalize) {
    std::string options(compileOptions_.c_str());
    options.append(" -just-kernel=");
    options.append(openClKernelName.c_str());
    // Append an option so that we can selectively enable a SCOption on CZ
    // whenever IOMMUv2 is enabled.
    if (dev().settings().svmFineGrainSystem_) {
      options.append(" -sc-xnack-iommu");
    }
    error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
                       ACL_TYPE_ISA, nullptr);
    buildLog_ += aclGetCompilerLog(dev().compiler());
    if (error != ACL_SUCCESS) {
      LogError("Failed to finalize kernel");
      return false;
    }
  }

  // NOTE(review): the result is not checked here; a failure leaves the HW info
  // unset while initialization continues - confirm this is intended.
  aqlCreateHWInfo(sym);

  // Pull out metadata from the ELF
  size_t sizeOfArgList;
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
                       openClKernelName.c_str(), nullptr, &sizeOfArgList);
  if (error != ACL_SUCCESS) {
    return false;
  }

  {
    // The buffer is owned by a smart pointer so every early return releases it
    std::unique_ptr<char[]> aclArgList(new char[sizeOfArgList]);
    error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
                         openClKernelName.c_str(), aclArgList.get(), &sizeOfArgList);
    if (error != ACL_SUCCESS) {
      return false;
    }
    // Set the argList
    InitParameters(reinterpret_cast<const aclArgData*>(aclArgList.get()), argsBufferSize());
  }

  size_t sizeOfWorkGroupSize;
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
                       openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
  if (error != ACL_SUCCESS) {
    return false;
  }
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
                       openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
  if (error != ACL_SUCCESS) {
    return false;
  }

  // Copy wavefront size
  workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
  // Find total workgroup size
  if (workGroupInfo_.compileSize_[0] != 0) {
    workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
        workGroupInfo_.compileSize_[2];
  } else {
    workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
  }

  // Pull out printf metadata from the ELF
  size_t sizeOfPrintfList;
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
                       openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
  if (error != ACL_SUCCESS) {
    return false;
  }

  // Make sure kernel has any printf info
  if (0 != sizeOfPrintfList) {
    std::unique_ptr<char[]> aclPrintfList(new char[sizeOfPrintfList]);
    error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
                         openClKernelName.c_str(), aclPrintfList.get(), &sizeOfPrintfList);
    if (error != ACL_SUCCESS) {
      return false;
    }

    // Set the PrintfList
    InitPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList.get()));
  }

  aclMetadata md;
  md.enqueue_kernel = false;
  size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
                       openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
  if (error != ACL_SUCCESS) {
    return false;
  }
  flags_.dynamicParallelism_ = md.enqueue_kernel;

  md.kernel_index = -1;
  size_t sizeOfIndex = sizeof(md.kernel_index);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX,
                       openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
  if (error != ACL_SUCCESS) {
    return false;
  }
  index_ = md.kernel_index;

  size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
                       openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
                       &sizeOfWavesPerSimdHint);
  if (error != ACL_SUCCESS) {
    return false;
  }

  waveLimiter_.enable();

  size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
                       openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
                       &sizeOfWorkGroupSizeHint);
  if (error != ACL_SUCCESS) {
    return false;
  }

  size_t sizeOfVecTypeHint;
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
                       openClKernelName.c_str(), nullptr, &sizeOfVecTypeHint);
  if (error != ACL_SUCCESS) {
    return false;
  }

  if (0 != sizeOfVecTypeHint) {
    // One extra byte for the terminator appended below
    std::unique_ptr<char[]> vecTypeHint(new char[sizeOfVecTypeHint + 1]);
    error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
                         openClKernelName.c_str(), vecTypeHint.get(), &sizeOfVecTypeHint);
    if (error != ACL_SUCCESS) {
      return false;
    }
    vecTypeHint[sizeOfVecTypeHint] = '\0';
    workGroupInfo_.compileVecTypeHint_ = std::string(vecTypeHint.get());
  }

#endif  // defined(WITH_COMPILER_LIB)
  return true;
}
|
|
|
|
|
|
2017-04-13 13:56:38 -04:00
|
|
|
// Returns the device of this kernel viewed as a pal::Device
const Device& HSAILKernel::dev() const {
  const Device& palDevice = reinterpret_cast<const Device&>(dev_);
  return palDevice;
}
|
2016-01-22 18:18:55 -05:00
|
|
|
|
2017-04-13 13:56:38 -04:00
|
|
|
// Returns the program of this kernel viewed as an HSAILProgram
const HSAILProgram& HSAILKernel::prog() const {
  const HSAILProgram& hsailProgram = reinterpret_cast<const HSAILProgram&>(prog_);
  return hsailProgram;
}
|
|
|
|
|
|
2019-05-08 19:22:02 -04:00
|
|
|
// Writes the kernel arguments (including the hidden ones set up by the runtime) into
// a region reserved from the GPU's managed buffer and fills an AQL dispatch packet.
//   gpu          - virtual GPU providing the managed buffer, constant buffers and
//                  the list of referenced VM memory
//   kernel       - kernel object whose signature is compared against this one
//   sizes        - dimensions, global/local sizes and global offsets of the launch
//   params       - source argument data; hidden arguments are patched into it in
//                  place when the signature versions match
//   ldsAddress   - value stored in the packet's group_segment_size
//   vmDefQueue   - written into the hidden default queue argument when non-zero
//   vmParentWrap - in/out; receives the uploaded AmdAqlWrap address for kernels with
//                  dynamic parallelism and is written into the hidden completion
//                  action argument when non-zero. Dereferenced without a null check.
// Returns the dispatch packet located at gpu.cb(0)->SysMemCopy(); a copy of it is
// also placed in the reserved buffer right after the arguments.
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
                                                         const amd::NDRangeContainer& sizes,
                                                         const_address params,
                                                         size_t ldsAddress, uint64_t vmDefQueue,
                                                         uint64_t* vmParentWrap) const {
  const_address parameters = params;
  uint64_t argList;
  // Room for the arguments followed by a copy of the dispatch packet
  address aqlArgBuf = gpu.managedBuffer().reserve(
      argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
  gpu.addVmMemory(gpu.managedBuffer().activeMemory());

  if (dynamicParallelism()) {
    // Provide the host parent AQL wrap object to the kernel
    AmdAqlWrap wrap = {};
    wrap.state = AQL_WRAP_BUSY;
    *vmParentWrap = gpu.cb(1)->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
    gpu.addVmMemory(gpu.cb(1)->ActiveMemory());
  }

  // The check below handles a special case of single context with multiple devices
  // when the devices use different compilers(HSAIL and LC) and have different signatures
  const bool sameSignatureVersion =
      (this->signature().version() == kernel.signature().version());
  const amd::KernelSignature& signature =
      sameSignatureVersion ? kernel.signature() : this->signature();

  // If signatures don't match, then patch the parameters
  if (!sameSignatureVersion) {
    WriteAqlArgAt(aqlArgBuf, parameters, signature.paramsSize() - signature.at(0).offset_,
                  signature.at(0).offset_);
    // From here on the hidden arguments are patched directly in the reserved buffer
    parameters = aqlArgBuf;
  }

  // Check if runtime has to setup hidden arguments
  for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
    // Bind by reference: the descriptor is only read, no need to copy it per argument
    const auto& desc = signature.at(i);
    size_t offset = 0;
    switch (desc.info_.oclObject_) {
      case amd::KernelParameterDescriptor::HiddenNone:
      case amd::KernelParameterDescriptor::HiddenMultiGridSync:
        // Nothing is written for these hidden arguments
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
        offset = sizes.offset()[0];
        WriteAqlArgAt(const_cast<address>(parameters), &offset, desc.size_, desc.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
        if (sizes.dimensions() >= 2) {
          offset = sizes.offset()[1];
          WriteAqlArgAt(const_cast<address>(parameters), &offset, desc.size_, desc.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
        if (sizes.dimensions() >= 3) {
          offset = sizes.offset()[2];
          WriteAqlArgAt(const_cast<address>(parameters), &offset, desc.size_, desc.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
        // Only when the kernel has printf info and the printf buffer was allocated
        if ((printfInfo().size() > 0) && (gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
          // Pass the VM address of the printf buffer to the kernel
          size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
          gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
          WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, desc.size_, desc.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenDefaultQueue:
        if (vmDefQueue != 0) {
          WriteAqlArgAt(const_cast<address>(parameters), &vmDefQueue, desc.size_, desc.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenCompletionAction:
        if (*vmParentWrap != 0) {
          WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, desc.size_, desc.offset_);
        }
        break;
    }
  }

  // Load all kernel arguments
  if (sameSignatureVersion) {
    WriteAqlArgAt(aqlArgBuf, parameters, argsBufferSize(), 0);
  }

  // Note: In a case of structs the size won't match,
  // since HSAIL compiler expects a reference...
  assert(argsBufferSize() <= signature.paramsSize() &&
         "A mismatch of sizes of arguments between compiler and runtime!");

  hsa_kernel_dispatch_packet_t* hsaDisp =
      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());

  amd::NDRange local(sizes.local());
  const amd::NDRange& global = sizes.global();

  // Check if runtime has to find local workgroup size
  FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);

  constexpr uint16_t kDispatchPacketHeader =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (1 << HSA_PACKET_HEADER_BARRIER) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

  hsaDisp->header = kDispatchPacketHeader;
  hsaDisp->setup = sizes.dimensions();

  // Unused dimensions are dispatched with a size of 1
  hsaDisp->workgroup_size_x = local[0];
  hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
  hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;

  hsaDisp->grid_size_x = global[0];
  hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
  hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;

  // Initialize kernel ISA and execution buffer requirements
  hsaDisp->private_segment_size = spillSegSize();
  hsaDisp->group_segment_size = ldsAddress;
  hsaDisp->kernel_object = gpuAqlCode();

  hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
  hsaDisp->reserved2 = 0;
  hsaDisp->completion_signal.handle = 0;

  // Keep a copy of the packet right behind the arguments in the reserved buffer
  memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));

  // The kernel reads the queue pointer from an SGPR, so the queue memory is referenced
  if (AMD_HSA_BITS_GET(akc_.kernel_code_properties,
                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
    gpu.addVmMemory(gpu.hsaQueueMem());
  }

  return hsaDisp;
}
|
|
|
|
|
|
2017-04-13 13:56:38 -04:00
|
|
|
// Returns the program of this kernel viewed as a LightningProgram
const LightningProgram& LightningKernel::prog() const {
  const LightningProgram& lcProgram = reinterpret_cast<const LightningProgram&>(prog_);
  return lcProgram;
}
|
|
|
|
|
|
2018-12-10 15:49:24 -05:00
|
|
|
#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
2017-04-13 13:56:38 -04:00
|
|
|
// Looks up the metadata of the kernel with the given name in the program metadata.
// Returns a pointer to the first entry whose mName equals name, or nullptr if there
// is none. programMD is dereferenced without a null check.
static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
  const auto& kernels = programMD->mKernels;
  const auto match = std::find_if(kernels.begin(), kernels.end(),
                                  [&name](const KernelMD& md) { return md.mName == name; });
  if (match == kernels.end()) {
    return nullptr;
  }
  return &(*match);
}
|
2019-05-08 19:22:02 -04:00
|
|
|
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
2016-10-21 13:18:35 -04:00
|
|
|
|
2019-04-09 23:24:10 -04:00
|
|
|
#if defined(USE_COMGR_LIBRARY)
|
|
|
|
|
bool LightningKernel::init() {
|
2017-04-13 13:56:38 -04:00
|
|
|
flags_.internalKernel_ =
|
|
|
|
|
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
|
|
|
|
|
2019-01-04 15:06:29 -05:00
|
|
|
const amd_comgr_metadata_node_t* kernelMetaNode = prog().getKernelMetadata(name());
|
|
|
|
|
if (kernelMetaNode == nullptr) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2018-11-22 14:04:51 -05:00
|
|
|
|
2019-05-08 19:22:02 -04:00
|
|
|
KernelMD kernelMD;
|
2019-04-09 23:24:10 -04:00
|
|
|
if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
|
|
|
|
|
|
|
|
|
|
workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
|
|
|
|
|
kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
|
|
|
|
|
|
|
|
|
|
// Copy codeobject of this kernel from the program CPU segment
|
|
|
|
|
hsa_agent_t agent;
|
|
|
|
|
agent.handle = 1;
|
|
|
|
|
|
|
|
|
|
auto sym = prog().GetSymbol(symbolName().c_str(), const_cast<hsa_agent_t*>(&agent));
|
|
|
|
|
|
|
|
|
|
if (!setKernelCode(sym, &akc_)) {
|
2018-11-22 14:04:51 -05:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2019-06-28 15:01:03 -04:00
|
|
|
if (!prog().isNull()) {
|
|
|
|
|
codeSize_ = prog().codeSegGpu().owner()->getSize();
|
2019-04-09 23:24:10 -04:00
|
|
|
|
2019-06-28 15:01:03 -04:00
|
|
|
// handle device enqueue
|
|
|
|
|
if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
|
|
|
|
|
hsa_agent_t agent;
|
|
|
|
|
agent.handle = 1;
|
|
|
|
|
amd::hsa::loader::Symbol* rth_symbol;
|
2018-11-22 14:04:51 -05:00
|
|
|
|
2019-06-28 15:01:03 -04:00
|
|
|
// Get the runtime handle symbol GPU address
|
|
|
|
|
rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
|
|
|
|
|
const_cast<hsa_agent_t*>(&agent));
|
|
|
|
|
uint64_t symbol_address;
|
|
|
|
|
rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);
|
2018-11-22 14:04:51 -05:00
|
|
|
|
2019-06-28 15:01:03 -04:00
|
|
|
// Copy the kernel_object pointer to the runtime handle symbol GPU address
|
|
|
|
|
const Memory& codeSegGpu = prog().codeSegGpu();
|
|
|
|
|
uint64_t offset = symbol_address - codeSegGpu.vmAddress();
|
|
|
|
|
uint64_t kernel_object = gpuAqlCode();
|
|
|
|
|
VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
|
2018-11-22 14:04:51 -05:00
|
|
|
|
2019-06-28 15:01:03 -04:00
|
|
|
const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};
|
2018-12-12 15:53:41 -05:00
|
|
|
|
2019-06-28 15:01:03 -04:00
|
|
|
codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
|
|
|
|
|
}
|
2018-11-22 14:04:51 -05:00
|
|
|
}
|
|
|
|
|
|
2019-04-09 23:24:10 -04:00
|
|
|
// Setup the the workgroup info
|
|
|
|
|
setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
|
2019-05-08 19:22:02 -04:00
|
|
|
kernelMD.mCodeProps.mGroupSegmentFixedSize, kernelMD.mCodeProps.mNumSGPRs,
|
2019-04-09 23:24:10 -04:00
|
|
|
kernelMD.mCodeProps.mNumVGPRs);
|
|
|
|
|
|
2018-11-22 14:04:51 -05:00
|
|
|
// Copy wavefront size
|
|
|
|
|
workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
|
|
|
|
|
|
|
|
|
|
workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize;
|
|
|
|
|
if (workGroupInfo_.size_ == 0) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// handle the printf metadata if any
|
|
|
|
|
std::vector<std::string> printfStr;
|
2019-10-28 18:13:35 -04:00
|
|
|
if (!GetPrintfStr(&printfStr)) {
|
2018-11-22 14:04:51 -05:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!printfStr.empty()) {
|
|
|
|
|
InitPrintf(printfStr);
|
|
|
|
|
}
|
2019-04-09 23:24:10 -04:00
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2019-05-08 19:22:02 -04:00
|
|
|
#endif // defined(USE_COMGR_LIBRARY)
|
2019-04-09 23:24:10 -04:00
|
|
|
|
|
|
|
|
// Initializes the kernel from the loader symbol and the program's code object
// metadata (only when built WITH_LIGHTNING_COMPILER and without USE_COMGR_LIBRARY;
// otherwise this is a no-op that returns true).
// Sets up the HW info, the parameters, the workgroup size attributes, the runtime
// handle for device enqueue (if the kernel has one) and printf.
// Returns false when the program or kernel metadata is missing, the runtime handle
// symbol can't be resolved, or the max flat workgroup size is 0.
bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
  flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos);

  // NOTE(review): the result is not checked here; a failure leaves the HW info
  // unset while initialization continues - confirm this is intended.
  aqlCreateHWInfo(symbol);

  const CodeObjectMD* programMD = prog().metadata();
  assert(programMD != nullptr);
  if (programMD == nullptr) {
    // The assert is compiled out in release builds; don't dereference null there
    return false;
  }

  const KernelMD* kernelMD = FindKernelMetadata(programMD, name());

  if (kernelMD == nullptr) {
    return false;
  }

  // Set the argList
  InitParameters(*kernelMD, argsBufferSize());

  if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) {
    const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize;
    workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0];
    workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1];
    workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2];
  }

  if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) {
    const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint;
    workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0];
    workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1];
    workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2];
  }

  if (!kernelMD->mAttrs.mVecTypeHint.empty()) {
    workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str();
  }

  if (!kernelMD->mAttrs.mRuntimeHandle.empty()) {
    hsa_agent_t agent;
    agent.handle = 1;

    // Get the runtime handle symbol GPU address
    amd::hsa::loader::Symbol* rth_symbol = prog().GetSymbol(
        const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()), &agent);
    if (rth_symbol == nullptr) {
      return false;
    }
    uint64_t symbol_address = 0;
    if (!rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address)) {
      return false;
    }

    // Copy the kernel_object pointer to the runtime handle symbol GPU address
    const Memory& codeSegGpu = prog().codeSegGpu();
    uint64_t offset = symbol_address - codeSegGpu.vmAddress();
    VirtualGPU* gpu = codeSegGpu.dev().xferQueue();

    const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};

    codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
  }

  // Copy wavefront size
  workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;

  workGroupInfo_.size_ = kernelMD->mCodeProps.mMaxFlatWorkGroupSize;
  if (workGroupInfo_.size_ == 0) {
    return false;
  }

  InitPrintf(programMD->mPrintf);

  /*FIXME_lmoriche:
  size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
      RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
      &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
  if (error != ACL_SUCCESS) {
      return false;
  }

  waveLimiter_.enable();
  */
#endif  // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
  return true;
}
|
|
|
|
|
|
2017-04-13 13:56:38 -04:00
|
|
|
} // namespace pal
|