P4 to Git Change 1567428 by gandryey@gera-w8 on 2018/06/12 18:39:23
SWDEV-79445 - OCL generic changes and code clean-up
- Optimize setup of kernel arguments. Stage 2.
- Add HW ABI support in the abstraction layer
- Remove arguments parsing loop from the kernel launch. Memory processing will be responsible for dependency tracking and patching of arguments.
http://ocltc.amd.com/reviews/r/15122/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#221 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#307 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#325 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#107 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/kernel.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/kernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/program.cpp#95 edit
[ROCm/clr commit: 1be400ff01]
This commit is contained in:
@@ -600,7 +600,9 @@ Settings::Settings() {
|
||||
//!< concurrent Virtual GPUs for default
|
||||
}
|
||||
|
||||
bool Kernel::createSignature(const parameters_t& params) {
|
||||
bool Kernel::createSignature(
|
||||
const parameters_t& params, const parameters_t& hiddenParams,
|
||||
uint32_t version) {
|
||||
std::stringstream attribs;
|
||||
if (workGroupInfo_.compileSize_[0] != 0) {
|
||||
attribs << "reqd_work_group_size(";
|
||||
@@ -632,7 +634,7 @@ bool Kernel::createSignature(const parameters_t& params) {
|
||||
// Destroy old signature if it was allocated before
|
||||
// (offline devices path)
|
||||
delete signature_;
|
||||
signature_ = new amd::KernelSignature(params, attribs.str());
|
||||
signature_ = new amd::KernelSignature(params, attribs.str(), hiddenParams, version);
|
||||
if (NULL != signature_) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -852,7 +852,9 @@ class Kernel : public amd::HeapObject {
|
||||
const std::string& name() const { return name_; }
|
||||
|
||||
//! Initializes the kernel parameters for the abstraction layer
|
||||
bool createSignature(const parameters_t& params);
|
||||
bool createSignature(
|
||||
const parameters_t& params, const parameters_t& hiddenParams,
|
||||
uint32_t version);
|
||||
|
||||
//! Returns TRUE if it's a HSA kernel
|
||||
bool hsa() const { return hsa_; }
|
||||
@@ -1624,6 +1626,22 @@ class Device : public RuntimeObject {
|
||||
};
|
||||
|
||||
struct KernelParameterDescriptor {
|
||||
//! Kinds of kernel argument a descriptor can represent. The Hidden* entries
//! name hidden (non-application) arguments; the *Object entries name the
//! application-visible argument categories.
//! NOTE(review): the difference between Value and ValueObject is not visible
//! in this change - confirm before relying on it.
enum {
|
||||
Value = 0,
|
||||
HiddenNone = 1,
|
||||
HiddenGlobalOffsetX = 2,
|
||||
HiddenGlobalOffsetY = 3,
|
||||
HiddenGlobalOffsetZ = 4,
|
||||
HiddenPrintfBuffer = 5,
|
||||
HiddenDefaultQueue = 6,
|
||||
HiddenCompletionAction = 7,
|
||||
MemoryObject = 8,
|
||||
ReferenceObject = 9,
|
||||
ValueObject = 10,
|
||||
ImageObject = 11,
|
||||
SamplerObject = 12,
|
||||
QueueObject = 13
|
||||
};
|
||||
const char* name_; //!< The parameter's name in the source
|
||||
clk_value_type_t type_; //!< The parameter's type
|
||||
size_t offset_; //!< Its offset in the parameter's stack
|
||||
@@ -1642,7 +1660,7 @@ struct KernelParameterDescriptor {
|
||||
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
|
||||
uint32_t defined_ : 1; //!< The argument was defined by the app
|
||||
uint32_t reserved_ : 1; //!< reserved
|
||||
uint32_t arrayIndex_ : 28; //!< Index in the objects array
|
||||
uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
|
||||
};
|
||||
uint32_t allValues_;
|
||||
InfoData() : allValues_(0) {}
|
||||
|
||||
@@ -752,7 +752,8 @@ bool NullKernel::create(const std::string& code, const std::string& metadata,
|
||||
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
|
||||
|
||||
device::Kernel::parameters_t params;
|
||||
if (!createSignature(params)) {
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1337,7 +1338,8 @@ bool Kernel::initParameters() {
|
||||
workGroupInfo_.localMemSize_ = hwLocalSize_;
|
||||
}
|
||||
|
||||
if (!createSignature(params)) {
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3017,7 +3019,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
||||
}
|
||||
}
|
||||
|
||||
createSignature(params);
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
|
||||
}
|
||||
|
||||
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
|
||||
|
||||
@@ -943,24 +943,30 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
|
||||
|
||||
uint32_t uint32_value = 0;
|
||||
uint64_t uint64_value = 0;
|
||||
size_t argSize = desc.size_;
|
||||
|
||||
if (desc.type_ == T_POINTER && desc.size_ != 0) {
|
||||
if ((value == NULL) || (static_cast<const cl_mem*>(value) == NULL)) {
|
||||
LP64_SWITCH(uint32_value, uint64_value) = 0;
|
||||
reinterpret_cast<Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
|
||||
} else {
|
||||
// convert cl_mem to amd::Memory*, return false if invalid.
|
||||
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>((
|
||||
*static_cast<Memory* const*>(value))->vmAddress());
|
||||
*static_cast<Memory* const*>(value))->virtualAddress());
|
||||
reinterpret_cast<Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
|
||||
*static_cast<Memory* const*>(value);
|
||||
// Note: Special case for image SRD, which is 64 bit always
|
||||
if (LP64_SWITCH(true, false) &&
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) {
|
||||
uint64_value = uint32_value;
|
||||
argSize = sizeof(uint64_t);
|
||||
}
|
||||
}
|
||||
} else if (desc.type_ == T_SAMPLER) {
|
||||
assert(false && "No sampler support in blit manager! Use internal samplers!");
|
||||
} else
|
||||
switch (desc.size_) {
|
||||
switch (argSize) {
|
||||
case 1:
|
||||
uint32_value = *static_cast<const uint8_t*>(value);
|
||||
break;
|
||||
@@ -977,7 +983,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
|
||||
break;
|
||||
}
|
||||
|
||||
switch (desc.size_) {
|
||||
switch (argSize) {
|
||||
case 0 /*local mem*/:
|
||||
*static_cast<size_t*>(param) = size;
|
||||
break;
|
||||
|
||||
@@ -228,6 +228,37 @@ inline static int GetHSAILArgSize(const aclArgData* argInfo) {
|
||||
}
|
||||
}
|
||||
|
||||
//! Translates the HSAIL argument type stored in arg->type_ into the matching
//! amd::KernelParameterDescriptor argument-kind constant.
//! \param arg  Kernel argument whose type_ field is inspected
//! \return The descriptor constant for that type; HiddenNone for any type
//!         without an explicit case below
inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
|
||||
switch (arg->type_){
|
||||
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
|
||||
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
|
||||
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
|
||||
return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
|
||||
case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER:
|
||||
return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
|
||||
case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
|
||||
return amd::KernelParameterDescriptor::HiddenDefaultQueue;
|
||||
case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
|
||||
return amd::KernelParameterDescriptor::HiddenCompletionAction;
|
||||
case HSAIL_ARGTYPE_POINTER:
|
||||
return amd::KernelParameterDescriptor::MemoryObject;
|
||||
case HSAIL_ARGTYPE_IMAGE:
|
||||
return amd::KernelParameterDescriptor::ImageObject;
|
||||
case HSAIL_ARGTYPE_REFERENCE:
|
||||
return amd::KernelParameterDescriptor::ReferenceObject;
|
||||
case HSAIL_ARGTYPE_VALUE:
|
||||
return amd::KernelParameterDescriptor::ValueObject;
|
||||
case HSAIL_ARGTYPE_SAMPLER:
|
||||
return amd::KernelParameterDescriptor::SamplerObject;
|
||||
case HSAIL_ARGTYPE_QUEUE:
|
||||
return amd::KernelParameterDescriptor::QueueObject;
|
||||
// Every remaining type, including HSAIL_ARGTYPE_HIDDEN_NONE, maps to HiddenNone
default:
|
||||
return amd::KernelParameterDescriptor::HiddenNone;
|
||||
}
|
||||
}
|
||||
|
||||
inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) {
|
||||
static const clk_value_type_t ClkValueMapType[6][6] = {
|
||||
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
|
||||
@@ -422,12 +453,22 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
||||
|
||||
// Iterate through the arguments and insert into parameterList
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
amd::KernelParameterDescriptor desc;
|
||||
size_t offset = 0;
|
||||
size_t offsetStruct = argsBufferSize();
|
||||
|
||||
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
||||
// skip the hidden arguments
|
||||
if (arguments_[i]->index_ == uint(-1)) continue;
|
||||
// Allocate the hidden arguments, but abstraction layer will skip them
|
||||
if (arguments_[i]->index_ == uint(-1)) {
|
||||
offset = amd::alignUp(offset, arguments_[i]->alignment_);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = arguments_[i]->size_;
|
||||
offset += arguments_[i]->size_;
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = arguments_[i]->name_.c_str();
|
||||
desc.type_ = GetOclType(arguments_[i]);
|
||||
@@ -435,6 +476,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
||||
desc.accessQualifier_ = GetOclAccessQual(arguments_[i]);
|
||||
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
||||
desc.typeName_ = arguments_[i]->typeName_.c_str();
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
|
||||
desc.info_.arrayIndex_ = arguments_[i]->pointeeAlignment_;
|
||||
|
||||
// Make a check if it is local or global
|
||||
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
||||
@@ -451,9 +494,32 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
||||
// Local memory for CPU
|
||||
size = sizeof(cl_mem);
|
||||
}
|
||||
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
||||
desc.offset_ = offset;
|
||||
offset += amd::alignUp(size, sizeof(uint32_t));
|
||||
// Check if HSAIL expects data by reference and allocate it behind
|
||||
if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) {
|
||||
desc.offset_ = offsetStruct;
|
||||
// Align the offset reference
|
||||
offset = amd::alignUp(offset, sizeof(size_t));
|
||||
patchReferences_.insert({desc.offset_, offset});
|
||||
offsetStruct += size;
|
||||
// Adjust the offset of arguments
|
||||
offset += sizeof(size_t);
|
||||
} else {
|
||||
// These objects have forced data size to uint64_t
|
||||
if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
} else {
|
||||
offset = amd::alignUp(offset, arguments_[i]->alignment_);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
}
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
|
||||
|
||||
params.push_back(desc);
|
||||
|
||||
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
|
||||
@@ -464,7 +530,7 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
||||
}
|
||||
}
|
||||
|
||||
createSignature(params);
|
||||
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
|
||||
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
|
||||
@@ -869,247 +935,79 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void WriteAqlArg(
|
||||
unsigned char** dst, //!< The write pointer to the buffer
|
||||
const T* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
uint alignment //!< The alignment to follow while writing to the buffer
|
||||
) {
|
||||
*dst = amd::alignUp(*dst, alignment);
|
||||
memcpy(*dst, src, size);
|
||||
*dst += size;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void WriteAqlArg(
|
||||
unsigned char** dst, //!< The write pointer to the buffer
|
||||
const uint32_t* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
uint alignment //!< The alignment to follow while writing to the buffer
|
||||
) {
|
||||
*dst = amd::alignUp(*dst, alignment);
|
||||
*(reinterpret_cast<uint32_t*>(*dst)) = *src;
|
||||
*dst += size;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void WriteAqlArg(
|
||||
unsigned char** dst, //!< The write pointer to the buffer
|
||||
const uint64_t* src, //!< The source pointer
|
||||
uint size, //!< The size in bytes to copy
|
||||
uint alignment //!< The alignment to follow while writing to the buffer
|
||||
) {
|
||||
*dst = amd::alignUp(*dst, alignment);
|
||||
*(reinterpret_cast<uint64_t*>(*dst)) = *src;
|
||||
*dst += size;
|
||||
}
|
||||
|
||||
const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
|
||||
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
|
||||
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
|
||||
static const bool WaitOnBusyEngine = true;
|
||||
uint64_t ldsAddress = ldsSize();
|
||||
address aqlArgBuf = gpu.cb(0)->SysMemCopy();
|
||||
bool srdResource = false;
|
||||
const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
|
||||
uint64_t argList;
|
||||
address aqlArgBuf = gpu.managedBuffer().reserve(
|
||||
argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
|
||||
gpu.addVmMemory(gpu.managedBuffer().activeMemory());
|
||||
|
||||
if (dynamicParallelism()) {
|
||||
// Provide the host parent AQL wrap object to the kernel
|
||||
AmdAqlWrap wrap = {};
|
||||
wrap.state = AQL_WRAP_BUSY;
|
||||
const ConstantBuffer* cb = gpu.cb(1);
|
||||
*vmParentWrap = cb->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
|
||||
gpu.addVmMemory(cb->ActiveMemory());
|
||||
*vmParentWrap = gpu.cb(1)->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
|
||||
gpu.addVmMemory(gpu.cb(1)->ActiveMemory());
|
||||
}
|
||||
|
||||
const amd::KernelSignature& signature = kernel.signature();
|
||||
const amd::KernelParameters& kernelParams = kernel.parameters();
|
||||
amd::Memory* const* memories =
|
||||
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
|
||||
|
||||
// Find all parameters for the current kernel
|
||||
for (auto arg : arguments_) {
|
||||
const_address paramaddr = nullptr;
|
||||
if (arg->index_ != uint(-1)) {
|
||||
paramaddr = parameters + signature.at(arg->index_).offset_;
|
||||
}
|
||||
|
||||
// Handle the hidden arguments first, as they do not have a
|
||||
// matching parameter in the OCL signature (not a valid arg->index_)
|
||||
switch (arg->type_) {
|
||||
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
|
||||
size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
|
||||
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
|
||||
WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_);
|
||||
// Check if runtime has to setup hidden arguments
|
||||
for (const auto& it : signature.hiddenParameters()) {
|
||||
size_t offset;
|
||||
switch (it.info_.oclObject_) {
|
||||
case amd::KernelParameterDescriptor::HiddenNone:
|
||||
//WriteAqlArgAt(aqlArgBuf, &zero, it.size_, it.offset_);
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
|
||||
size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
|
||||
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
|
||||
WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_);
|
||||
case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
|
||||
offset = sizes.offset()[0];
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
|
||||
size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
|
||||
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
|
||||
WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_);
|
||||
case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
|
||||
if (sizes.dimensions() >= 2) {
|
||||
offset = sizes.offset()[1];
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
|
||||
size_t bufferPtr = 0;
|
||||
case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
|
||||
if (sizes.dimensions() >= 3) {
|
||||
offset = sizes.offset()[2];
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
|
||||
if ((printfInfo().size() > 0) &&
|
||||
// and printf buffer was allocated
|
||||
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
|
||||
// and set the fourth argument as the printf_buffer pointer
|
||||
bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
|
||||
size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().
|
||||
dbgBuffer()->vmAddress());
|
||||
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
|
||||
}
|
||||
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
|
||||
WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
|
||||
assert(arg->size_ == sizeof(static_cast<size_t>(vmDefQueue)) && "check the sizes");
|
||||
WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_);
|
||||
break;
|
||||
case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
|
||||
assert(arg->size_ == sizeof(static_cast<size_t>(*vmParentWrap)) && "check the sizes");
|
||||
WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_);
|
||||
break;
|
||||
case HSAIL_ARGTYPE_HIDDEN_NONE: {
|
||||
void* zero = 0;
|
||||
assert(arg->size_ <= sizeof(zero) && "check the sizes");
|
||||
WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_POINTER: {
|
||||
// If it is a local pointer
|
||||
if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
|
||||
ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_);
|
||||
WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_);
|
||||
ldsAddress += *reinterpret_cast<const size_t*>(paramaddr);
|
||||
break;
|
||||
}
|
||||
assert(
|
||||
(arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) &&
|
||||
"Unsupported address qualifier");
|
||||
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_REFERENCE: {
|
||||
const ConstantBuffer* cb = gpu.cb(1);
|
||||
// Copy the current structure into CB1
|
||||
size_t gpuPtr = static_cast<size_t>(cb->UploadDataToHw(paramaddr, arg->size_));
|
||||
// Then use a pointer in aqlArgBuffer to CB1
|
||||
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t), sizeof(size_t));
|
||||
gpu.addVmMemory(cb->ActiveMemory());
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_VALUE:
|
||||
if (arg->size_ == sizeof(uint32_t)) {
|
||||
WriteAqlArg(&aqlArgBuf, reinterpret_cast<const uint32_t*>(paramaddr),
|
||||
sizeof(uint32_t), arg->alignment_);
|
||||
} else if (arg->size_ == sizeof(uint64_t)) {
|
||||
WriteAqlArg(&aqlArgBuf, reinterpret_cast<const uint64_t*>(paramaddr),
|
||||
sizeof(uint64_t), arg->alignment_);
|
||||
} else {
|
||||
WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_);
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
case HSAIL_ARGTYPE_IMAGE: {
|
||||
Image* image = nullptr;
|
||||
amd::Memory* mem = nullptr;
|
||||
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
|
||||
if (nativeMem) {
|
||||
image = reinterpret_cast<Image* const*>(memories)[index];
|
||||
if (nullptr != image) {
|
||||
mem = image->owner();
|
||||
}
|
||||
} else {
|
||||
mem = memories[index];
|
||||
if (mem != nullptr) {
|
||||
image = static_cast<Image*>(dev().getGpuMemory(mem));
|
||||
}
|
||||
}
|
||||
|
||||
//! \note Special case for the image views.
|
||||
//! Copy SRD to CB1, so blit manager will be able to release
|
||||
//! this view without a wait for SRD resource.
|
||||
if (image->memoryType() == Resource::ImageView) {
|
||||
// Copy the current image SRD into CB1
|
||||
const ConstantBuffer* cb = gpu.cb(1);
|
||||
uint64_t srd = cb->UploadDataToHw(image->hwState(), HsaImageObjectSize);
|
||||
// Then use a pointer in aqlArgBuffer to CB1
|
||||
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
|
||||
gpu.addVmMemory(cb->ActiveMemory());
|
||||
} else {
|
||||
uint64_t srd = image->hwSrd();
|
||||
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
|
||||
srdResource = true;
|
||||
}
|
||||
|
||||
if (image->desc().isDoppTexture_) {
|
||||
gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(),
|
||||
kernel.parameters().getExecPfpaVcop());
|
||||
case amd::KernelParameterDescriptor::HiddenDefaultQueue:
|
||||
if (vmDefQueue != 0) {
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &vmDefQueue, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_SAMPLER: {
|
||||
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
|
||||
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
|
||||
kernelParams.samplerObjOffset())[index];
|
||||
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
|
||||
uint64_t srd = gpuSampler->hwSrd();
|
||||
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
|
||||
srdResource = true;
|
||||
break;
|
||||
}
|
||||
case HSAIL_ARGTYPE_QUEUE: {
|
||||
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
|
||||
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
|
||||
parameters + kernelParams.queueObjOffset())[index];
|
||||
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
||||
uint64_t vmQueue;
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
vmQueue = gpuQueue->vQueue()->vmAddress();
|
||||
} else {
|
||||
if (!gpu.createVirtualQueue(queue->size())) {
|
||||
LogError("Virtual queue creation failed!");
|
||||
return nullptr;
|
||||
}
|
||||
vmQueue = gpu.vQueue()->vmAddress();
|
||||
case amd::KernelParameterDescriptor::HiddenCompletionAction:
|
||||
if (*vmParentWrap != 0) {
|
||||
WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, it.size_, it.offset_);
|
||||
}
|
||||
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue), sizeof(vmQueue));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogError(" Unsupported argument type ");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (ldsAddress > dev().info().localMemSize_) {
|
||||
LogError("No local memory available\n");
|
||||
return nullptr;
|
||||
}
|
||||
// Load all kernel arguments
|
||||
WriteAqlArgAt(aqlArgBuf, parameters, signature.paramsSize(), 0);
|
||||
assert(argsBufferSize() == amd::alignUp(signature.paramsSize(), 16) &&
|
||||
"A mismatch of sizes of arguments between compiler and runtime!");
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
// Check there is no arguments' buffer overflow. We may not use all the
|
||||
// hidden argument slots.
|
||||
assert(aqlArgBuf <= (gpu.cb(0)->SysMemCopy() + argsBufferSize()));
|
||||
#else // !defined(WITH_LIGHTNING_COMPILER)
|
||||
// HSAIL kernarg segment size is rounded up to multiple of 16.
|
||||
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
|
||||
assert((aqlArgBuf == (gpu.cb(0)->SysMemCopy() + argsBufferSize())) &&
|
||||
"Size and the number of arguments don't match!");
|
||||
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
||||
hsa_kernel_dispatch_packet_t* hsaDisp =
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy() + argsBufferSize());
|
||||
//hsa_kernel_dispatch_packet_t disp;
|
||||
hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
|
||||
gpu.cb(0)->SysMemCopy());
|
||||
|
||||
amd::NDRange local(sizes.local());
|
||||
const amd::NDRange& global = sizes.global();
|
||||
@@ -1117,6 +1015,12 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
// Check if runtime has to find local workgroup size
|
||||
findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
|
||||
|
||||
constexpr uint16_t kDispatchPacketHeader =
|
||||
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
|
||||
hsaDisp->header = kDispatchPacketHeader;
|
||||
hsaDisp->setup = sizes.dimensions();
|
||||
|
||||
@@ -1134,28 +1038,16 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
||||
hsaDisp->group_segment_size = ldsAddress - ldsSize();
|
||||
hsaDisp->kernel_object = gpuAqlCode();
|
||||
|
||||
const ConstantBuffer* cb = gpu.cb(0);
|
||||
uint64_t argList = cb->UploadDataToHw(
|
||||
argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
|
||||
|
||||
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
|
||||
hsaDisp->reserved2 = 0;
|
||||
hsaDisp->completion_signal.handle = 0;
|
||||
memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));
|
||||
|
||||
gpu.addVmMemory(cb->ActiveMemory());
|
||||
gpu.addVmMemory(&prog().codeSegGpu());
|
||||
for (pal::Memory* mem : prog().globalStores()) {
|
||||
gpu.addVmMemory(mem);
|
||||
}
|
||||
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
|
||||
gpu.addVmMemory(gpu.hsaQueueMem());
|
||||
}
|
||||
|
||||
if (srdResource || prog().isStaticSampler()) {
|
||||
dev().srds().fillResourceList(gpu);
|
||||
}
|
||||
|
||||
return hsaDisp;
|
||||
}
|
||||
|
||||
@@ -1398,6 +1290,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcA
|
||||
|
||||
void LightningKernel::initArgList(const KernelMD& kernelMD) {
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
size_t offsetStruct = argsBufferSize();
|
||||
|
||||
size_t offset = 0;
|
||||
|
||||
@@ -1426,20 +1320,27 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) {
|
||||
|
||||
arg->index_ = isHidden ? uint(-1) : params.size();
|
||||
arguments_.push_back(arg);
|
||||
|
||||
if (isHidden) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Initialize Device kernel parameters
|
||||
amd::KernelParameterDescriptor desc;
|
||||
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, arguments_[i]->alignment_);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = arguments_[i]->size_;
|
||||
offset += arguments_[i]->size_;
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = lcArg.mName.c_str();
|
||||
desc.type_ = GetOclType(arg);
|
||||
desc.addressQualifier_ = GetOclAddrQual(arg);
|
||||
desc.accessQualifier_ = GetOclAccessQual(arg);
|
||||
desc.typeQualifier_ = GetOclTypeQual(lcArg);
|
||||
desc.typeName_ = lcArg.mTypeName.c_str();
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
desc.info_.arrayIndex_ = arg->pointeeAlignment_;
|
||||
|
||||
// Make a check if it is local or global
|
||||
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
||||
@@ -1456,14 +1357,37 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) {
|
||||
// Local memory for CPU
|
||||
size = sizeof(cl_mem);
|
||||
}
|
||||
offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
|
||||
desc.offset_ = offset;
|
||||
offset += amd::alignUp(size, sizeof(uint32_t));
|
||||
// Check if HSAIL expects data by reference and allocate it behind
|
||||
if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) {
|
||||
desc.offset_ = offsetStruct;
|
||||
// Align the offset reference
|
||||
offset = amd::alignUp(offset, sizeof(size_t));
|
||||
patchReferences_.insert({ desc.offset_, offset });
|
||||
offsetStruct += size;
|
||||
// Adjust the offset of arguments
|
||||
offset += sizeof(size_t);
|
||||
}
|
||||
else {
|
||||
// These objects have forced data size to uint64_t
|
||||
if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
} else {
|
||||
offset = amd::alignUp(offset, arguments_[i]->alignment_);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
}
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
|
||||
|
||||
params.push_back(desc);
|
||||
}
|
||||
|
||||
createSignature(params);
|
||||
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
|
||||
static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
|
||||
|
||||
@@ -182,7 +182,7 @@ class HSAILKernel : public device::Kernel {
|
||||
const amd::Kernel& kernel, //!< AMD kernel object
|
||||
const amd::NDRangeContainer& sizes, //!< NDrange container
|
||||
const_address parameters, //!< Application arguments for the kernel
|
||||
bool nativeMem, //!< Native memory objects are passed
|
||||
size_t ldsAddress, //!< LDS address that includes all arguments.
|
||||
uint64_t vmDefQueue, //!< GPU VM default queue pointer
|
||||
uint64_t* vmParentWrap //!< GPU VM parent aql wrap object
|
||||
) const;
|
||||
@@ -204,6 +204,8 @@ class HSAILKernel : public device::Kernel {
|
||||
return waveLimiter_.getWavesPerSH(vdev);
|
||||
};
|
||||
|
||||
const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
HSAILKernel(const HSAILKernel&);
|
||||
@@ -234,6 +236,7 @@ class HSAILKernel : public device::Kernel {
|
||||
const HSAILProgram& prog_; //!< Reference to the parent program
|
||||
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
|
||||
uint index_; //!< Kernel index in the program
|
||||
std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
|
||||
|
||||
uint64_t code_; //!< GPU memory pointer to the kernel
|
||||
size_t codeSize_; //!< Size of ISA code
|
||||
|
||||
@@ -219,6 +219,8 @@ class Image : public pal::Memory {
|
||||
size_t* slicePitch = NULL //!< Slice for the mapped memory
|
||||
);
|
||||
|
||||
//! Overrides the base virtualAddress(): for an image the reported address is the value of hwSrd()
virtual uint64_t virtualAddress() const override { return hwSrd(); }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Image(const Image&);
|
||||
|
||||
@@ -461,9 +461,8 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
|
||||
if (flushL1Cache) {
|
||||
// Flush cache
|
||||
if (!gpu.profiling()) {
|
||||
gpu.addBarrier();
|
||||
gpu.addBarrier();
|
||||
}
|
||||
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
clear(!All);
|
||||
@@ -2112,13 +2111,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
size_t ldsSize;
|
||||
// Check memory dependency and SVM objects
|
||||
if (!processMemObjectsHSA(kernel, parameters, nativeMem)) {
|
||||
if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
|
||||
LogError("Wrong memory objects!");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool needFlush = false;
|
||||
// Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
|
||||
// are in the same cmdBuffer
|
||||
@@ -2194,7 +2192,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
uint64_t vmParentWrap = 0;
|
||||
// Program the kernel arguments for the GPU execution
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
|
||||
*this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap);
|
||||
*this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap);
|
||||
if (nullptr == aqlPkt) {
|
||||
LogError("Couldn't load kernel arguments");
|
||||
return false;
|
||||
@@ -2948,7 +2946,7 @@ void VirtualGPU::profileEvent(EngineType engine, bool type) const {
|
||||
}
|
||||
|
||||
bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
|
||||
bool nativeMem) {
|
||||
bool nativeMem, size_t& ldsAddress) {
|
||||
const amd::KernelParameters& kernelParams = kernel.parameters();
|
||||
|
||||
// Mark the tracker with a new kernel,
|
||||
@@ -3015,68 +3013,155 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
|
||||
}
|
||||
}
|
||||
|
||||
bool srdResource = false;
|
||||
amd::Memory* const* memories =
|
||||
reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
|
||||
const HSAILKernel& hsaKernel =
|
||||
static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
|
||||
const amd::KernelSignature& signature = kernel.signature();
|
||||
ldsAddress = hsaKernel.ldsSize();
|
||||
|
||||
// Check all parameters for the current kernel
|
||||
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
||||
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
||||
const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i);
|
||||
|
||||
// Find if current argument is a buffer
|
||||
if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
|
||||
Memory* gpuMem = nullptr;
|
||||
amd::Memory* mem = nullptr;
|
||||
uint32_t index = desc.info_.arrayIndex_;
|
||||
if (nativeMem) {
|
||||
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
|
||||
if (nullptr != gpuMem) {
|
||||
mem = gpuMem->owner();
|
||||
}
|
||||
} else {
|
||||
mem = memories[index];
|
||||
if (mem != nullptr) {
|
||||
gpuMem = dev().getGpuMemory(mem);
|
||||
// Synchronize data with other memory instances if necessary
|
||||
gpuMem->syncCacheFromHost(*this);
|
||||
}
|
||||
}
|
||||
//! This condition is for SVM fine-grain
|
||||
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
|
||||
addBarrier();
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
memoryDependency().clear(!All);
|
||||
continue;
|
||||
} else if (gpuMem != nullptr) {
|
||||
// Check image
|
||||
bool readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
|
||||
// Check buffer
|
||||
readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
|
||||
// Validate memory for a dependency in the queue
|
||||
memoryDependency().validate(*this, gpuMem, readOnly);
|
||||
|
||||
// Wait for resource if it was used on an inactive engine
|
||||
//! \note syncCache may call DRM transfer
|
||||
constexpr bool WaitOnBusyEngine = true;
|
||||
gpuMem->wait(*this, WaitOnBusyEngine);
|
||||
|
||||
//! Check if compiler expects read/write
|
||||
if ((mem != nullptr) && !desc.info_.readOnly_) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
addVmMemory(gpuMem);
|
||||
if (!nativeMem) {
|
||||
// Process cache coherency first, since the extra transfers may affect
|
||||
// other mem dependency tracking logic: TS and signalWrite()
|
||||
for (uint i = 0; i < signature.numMemories(); ++i) {
|
||||
amd::Memory* mem = memories[i];
|
||||
if (mem != nullptr) {
|
||||
// Synchronize data with other memory instances if necessary
|
||||
dev().getGpuMemory(mem)->syncCacheFromHost(*this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (pal::Memory* mem : hsaKernel.prog().globalStores()) {
|
||||
// Check all parameters for the current kernel
|
||||
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
||||
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
||||
const amd::KernelParameterDescriptor::InfoData& info = desc.info_;
|
||||
|
||||
// Find if current argument is a buffer
|
||||
if (desc.type_ == T_POINTER) {
|
||||
// If it is a local pointer
|
||||
if (desc.size_ == 0) {
|
||||
ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
|
||||
// Save the original LDS size
|
||||
size_t ldsSize = *reinterpret_cast<const size_t*>(params + desc.offset_);
|
||||
// Patch the LDS address in the original arguments with an LDS address(offset)
|
||||
WriteAqlArgAt(const_cast<address>(params), &ldsAddress, sizeof(void*), desc.offset_);
|
||||
// Add the original size
|
||||
ldsAddress += ldsSize;
|
||||
} else {
|
||||
Memory* gpuMem = nullptr;
|
||||
amd::Memory* mem = nullptr;
|
||||
uint32_t index = info.arrayIndex_;
|
||||
if (nativeMem) {
|
||||
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
|
||||
if (nullptr != gpuMem) {
|
||||
mem = gpuMem->owner();
|
||||
}
|
||||
} else {
|
||||
mem = memories[index];
|
||||
if (mem != nullptr) {
|
||||
gpuMem = dev().getGpuMemory(mem);
|
||||
}
|
||||
}
|
||||
//! This condition is for SVM fine-grain
|
||||
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
|
||||
addBarrier();
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
memoryDependency().clear(!All);
|
||||
continue;
|
||||
} else if (gpuMem != nullptr) {
|
||||
// Validate memory for a dependency in the queue
|
||||
memoryDependency().validate(*this, gpuMem, info.readOnly_);
|
||||
// Wait for resource if it was used on an inactive engine
|
||||
//! \note syncCache may call DRM transfer
|
||||
constexpr bool WaitOnBusyEngine = true;
|
||||
gpuMem->wait(*this, WaitOnBusyEngine);
|
||||
|
||||
addVmMemory(gpuMem);
|
||||
|
||||
//! Check if compiler expects read/write.
|
||||
//! Note: SVM with subbuffers has an issue with tracking.
|
||||
//! Conformance can send read only subbuffer, but update the region
|
||||
//! in the kernel.
|
||||
if ((mem != nullptr) &&
|
||||
((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
|
||||
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
|
||||
//! \note Special case for the image views.
|
||||
//! Copy SRD to CB1, so blit manager will be able to release
|
||||
//! this view without a wait for SRD resource.
|
||||
if (gpuMem->memoryType() == Resource::ImageView) {
|
||||
// Copy the current image SRD into CB1
|
||||
uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize);
|
||||
// Then use a pointer in aqlArgBuffer to CB1
|
||||
// Patch the GPU VA address in the original arguments
|
||||
WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
|
||||
addVmMemory(cb(1)->ActiveMemory());
|
||||
} else {
|
||||
srdResource = true;
|
||||
}
|
||||
if (gpuMem->desc().isDoppTexture_) {
|
||||
addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
|
||||
kernel.parameters().getExecPfpaVcop());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (desc.type_ == T_VOID) {
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
|
||||
// Copy the current structure into CB1
|
||||
size_t gpuPtr = static_cast<size_t>(cb(1)->UploadDataToHw(params, desc.size_));
|
||||
// Then use a pointer in aqlArgBuffer to CB1
|
||||
const auto it = hsaKernel.patch().find(desc.offset_);
|
||||
// Patch the GPU VA address in the original arguments
|
||||
WriteAqlArgAt(const_cast<address>(params), &gpuPtr, sizeof(size_t), it->second);
|
||||
addVmMemory(cb(1)->ActiveMemory());
|
||||
}
|
||||
}
|
||||
else if (desc.type_ == T_SAMPLER) {
|
||||
srdResource = true;
|
||||
} else if (desc.type_ == T_QUEUE) {
|
||||
uint32_t index = desc.info_.arrayIndex_;
|
||||
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
|
||||
params + kernelParams.queueObjOffset())[index];
|
||||
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
||||
uint64_t vmQueue;
|
||||
if (dev().settings().useDeviceQueue_) {
|
||||
vmQueue = gpuQueue->vQueue()->vmAddress();
|
||||
} else {
|
||||
if (!createVirtualQueue(queue->size())) {
|
||||
LogError("Virtual queue creation failed!");
|
||||
return false;
|
||||
}
|
||||
vmQueue = vQueue()->vmAddress();
|
||||
}
|
||||
// Patch the GPU VA address in the original arguments
|
||||
WriteAqlArgAt(const_cast<address>(params), &vmQueue, sizeof(vmQueue), desc.offset_);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ldsAddress > dev().info().localMemSize_) {
|
||||
LogError("No local memory available\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (srdResource || hsaKernel.prog().isStaticSampler()) {
|
||||
dev().srds().fillResourceList(*this);
|
||||
}
|
||||
|
||||
addVmMemory(&hsaKernel.prog().codeSegGpu());
|
||||
|
||||
for (const pal::Memory* mem : hsaKernel.prog().globalStores()) {
|
||||
const static bool IsReadOnly = false;
|
||||
// Validate global store for a dependency in the queue
|
||||
memoryDependency().validate(*this, mem, IsReadOnly);
|
||||
addVmMemory(mem);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
@@ -378,6 +378,9 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Return xfer buffer for staging operations
|
||||
XferBuffer& xferWrite() { return writeBuffer_; }
|
||||
|
||||
//! Return managed buffer for staging operations
|
||||
ManagedBuffer& managedBuffer() { return managedBuffer_; }
|
||||
|
||||
//! Adds a pinned memory object into a map
|
||||
void addPinnedMem(amd::Memory* mem);
|
||||
|
||||
@@ -529,7 +532,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Detects memory dependency for HSAIL kernels and flushes caches
|
||||
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
|
||||
const_address params, //!< Pointer to the param's store
|
||||
bool nativeMem //!< Native memory objects
|
||||
bool nativeMem, //!< Native memory objects
|
||||
size_t& ldsAddess //!< Returns LDS size, used in the kernel
|
||||
);
|
||||
|
||||
//! Common function for fill memory used by both svm Fill and non-svm fill
|
||||
@@ -644,4 +648,33 @@ uint VirtualGPU::Queue::submit(bool forceFlush) {
|
||||
return id;
|
||||
}
|
||||
|
||||
//! Writes a value into a kernel argument buffer at a given byte offset.
//! Generic version: copies exactly \p size bytes from \p src to \p dst + \p offset.
template <typename T>
inline void WriteAqlArgAt(
    unsigned char* dst,  //!< Base pointer of the destination argument buffer
    const T* src,        //!< The source pointer
    uint size,           //!< The size in bytes to copy
    size_t offset        //!< Byte offset from dst at which the value is written
) {
  memcpy(dst + offset, src, size);
}

//! 32-bit specialization: always stores exactly sizeof(uint32_t) bytes.
//! \note \p size is ignored (kept for a uniform call signature).
//! A fixed-size memcpy is used instead of a store through a casted pointer,
//! so an offset that is not 4-byte aligned can't trigger a misaligned,
//! type-punned write into the byte buffer.
template <>
inline void WriteAqlArgAt(
    unsigned char* dst,   //!< Base pointer of the destination argument buffer
    const uint32_t* src,  //!< The source pointer
    uint size,            //!< Ignored, sizeof(uint32_t) bytes are always written
    size_t offset         //!< Byte offset from dst at which the value is written
) {
  memcpy(dst + offset, src, sizeof(uint32_t));
}

//! 64-bit specialization: always stores exactly sizeof(uint64_t) bytes.
//! \note \p size is ignored (kept for a uniform call signature).
//! Uses a fixed-size memcpy for the same alignment/aliasing reasons as the
//! 32-bit version.
template <>
inline void WriteAqlArgAt(
    unsigned char* dst,   //!< Base pointer of the destination argument buffer
    const uint64_t* src,  //!< The source pointer
    uint size,            //!< Ignored, sizeof(uint64_t) bytes are always written
    size_t offset         //!< Byte offset from dst at which the value is written
) {
  memcpy(dst + offset, src, sizeof(uint64_t));
}
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
@@ -581,7 +581,8 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
|
||||
|
||||
params.push_back(desc);
|
||||
}
|
||||
createSignature(params);
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
|
||||
}
|
||||
#endif // defined(WITH_COMPILER_LIB)
|
||||
|
||||
@@ -660,8 +661,8 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
|
||||
|
||||
params.push_back(desc);
|
||||
}
|
||||
|
||||
createSignature(params);
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
|
||||
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
|
||||
@@ -243,13 +243,17 @@ void KernelParameters::release(address mem, const amd::Device& device) const {
|
||||
}
|
||||
|
||||
KernelSignature::KernelSignature(const std::vector<KernelParameterDescriptor>& params,
|
||||
const std::string& attrib)
|
||||
const std::string& attrib,
|
||||
const std::vector<KernelParameterDescriptor>& hiddenParams,
|
||||
uint32_t version)
|
||||
: params_(params)
|
||||
, hiddenParams_(hiddenParams)
|
||||
, attributes_(attrib)
|
||||
, paramsSize_(0)
|
||||
, numMemories_(0)
|
||||
, numSamplers_(0)
|
||||
, numQueues_(0) {
|
||||
, numQueues_(0)
|
||||
, version_(version) {
|
||||
size_t maxOffset = 0;
|
||||
size_t last = 0;
|
||||
// Find the last entry
|
||||
@@ -283,7 +287,15 @@ KernelSignature::KernelSignature(const std::vector<KernelParameterDescriptor>& p
|
||||
if (lastSize == 0 /* local mem */) {
|
||||
lastSize = sizeof(cl_mem);
|
||||
}
|
||||
paramsSize_ = params[last].offset_ + alignUp(lastSize, sizeof(intptr_t));
|
||||
// Note: It's a special case. HW ABI expects 64 bit for SRD, regardless of the binary.
|
||||
// Force the size to 64 bit for those cases.
|
||||
if ((params[last].info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(params[last].info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(params[last].info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
lastSize = alignUp(lastSize, sizeof(uint64_t));
|
||||
}
|
||||
paramsSize_ = params[last].offset_ + lastSize;
|
||||
paramsSize_ = alignUp(paramsSize_, sizeof(intptr_t));
|
||||
}
|
||||
}
|
||||
} // namespace amd
|
||||
|
||||
@@ -36,18 +36,30 @@ class Program;
|
||||
class KernelSignature : public HeapObject {
|
||||
private:
|
||||
std::vector<KernelParameterDescriptor> params_;
|
||||
std::vector<KernelParameterDescriptor> hiddenParams_;
|
||||
std::string attributes_; //!< The kernel attributes
|
||||
uint32_t paramsSize_;
|
||||
uint32_t numMemories_;
|
||||
uint32_t numSamplers_;
|
||||
uint32_t numQueues_;
|
||||
uint32_t version_;
|
||||
|
||||
public:
|
||||
enum {
|
||||
ABIVersion_0 = 0, //! ABI constructed based on the OCL semantics
|
||||
ABIVersion_1 = 1 //! ABI constructed based on the HW ABI returned from the compiler
|
||||
};
|
||||
|
||||
//! Default constructor
|
||||
KernelSignature() : paramsSize_(0), numMemories_(0), numSamplers_(0), numQueues_(0) {}
|
||||
KernelSignature():
|
||||
paramsSize_(0), numMemories_(0), numSamplers_(0),
|
||||
numQueues_(0), version_(ABIVersion_0) {}
|
||||
|
||||
//! Construct a new signature.
|
||||
KernelSignature(const std::vector<KernelParameterDescriptor>& params, const std::string& attrib);
|
||||
KernelSignature(const std::vector<KernelParameterDescriptor>& params,
|
||||
const std::string& attrib,
|
||||
const std::vector<KernelParameterDescriptor>& hiddenParams,
|
||||
uint32_t version);
|
||||
|
||||
//! Return the number of parameters
|
||||
size_t numParameters() const { return params_.size(); }
|
||||
@@ -72,8 +84,17 @@ class KernelSignature : public HeapObject {
|
||||
//! Returns the number of queue objects.
|
||||
uint32_t numQueues() const { return numQueues_; }
|
||||
|
||||
//! Returns the signature version
|
||||
uint32_t version() const { return version_; }
|
||||
|
||||
//! Return the kernel attributes
|
||||
const std::string& attributes() const { return attributes_; }
|
||||
|
||||
const std::vector<KernelParameterDescriptor>& hiddenParameters() const
|
||||
{ return hiddenParams_; }
|
||||
|
||||
const std::vector<KernelParameterDescriptor>& parameters() const
|
||||
{ return params_; }
|
||||
};
|
||||
|
||||
// @todo: look into a copy-on-write model instead of copy-on-read.
|
||||
|
||||
@@ -604,8 +604,8 @@ bool Program::ParseAllOptions(const std::string& options, option::Options& parse
|
||||
}
|
||||
|
||||
bool Symbol::setDeviceKernel(const Device& device, const device::Kernel* func) {
|
||||
// FIXME_lmoriche: check that the signatures are compatible
|
||||
if (deviceKernels_.size() == 0) {
|
||||
if (deviceKernels_.size() == 0 ||
|
||||
(func->signature().version() > KernelSignature::ABIVersion_0)) {
|
||||
signature_ = func->signature();
|
||||
}
|
||||
deviceKernels_[&device] = func;
|
||||
|
||||
Reference in New Issue
Block a user