From 665aab7ca4e7699a9926dbb2298ec622e17b385a Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 12 Jun 2018 18:57:20 -0400
Subject: [PATCH] P4 to Git Change 1567428 by gandryey@gera-w8 on 2018/06/12
18:39:23
SWDEV-79445 - OCL generic changes and code clean-up
- Optimize setup of kernel arguments. Stage 2.
- Add HW ABI support in the abstraction layer
- Remove the argument-parsing loop from the kernel launch. Memory processing is now responsible for dependency tracking and patching of arguments.
http://ocltc.amd.com/reviews/r/15122/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#221 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#307 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#325 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#107 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/kernel.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/kernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/program.cpp#95 edit
[ROCm/clr commit: 1be400ff014502df6628e71661894460d18ae458]
---
projects/clr/rocclr/runtime/device/device.cpp | 6 +-
projects/clr/rocclr/runtime/device/device.hpp | 22 +-
.../rocclr/runtime/device/gpu/gpukernel.cpp | 9 +-
.../clr/rocclr/runtime/device/pal/palblit.cpp | 14 +-
.../rocclr/runtime/device/pal/palkernel.cpp | 406 +++++++-----------
.../rocclr/runtime/device/pal/palkernel.hpp | 5 +-
.../rocclr/runtime/device/pal/palmemory.hpp | 2 +
.../rocclr/runtime/device/pal/palvirtual.cpp | 197 ++++++---
.../rocclr/runtime/device/pal/palvirtual.hpp | 35 +-
.../rocclr/runtime/device/rocm/rockernel.cpp | 7 +-
.../clr/rocclr/runtime/platform/kernel.cpp | 18 +-
.../clr/rocclr/runtime/platform/kernel.hpp | 25 +-
.../clr/rocclr/runtime/platform/program.cpp | 4 +-
13 files changed, 430 insertions(+), 320 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp
index 0be2916483..718afc078a 100644
--- a/projects/clr/rocclr/runtime/device/device.cpp
+++ b/projects/clr/rocclr/runtime/device/device.cpp
@@ -600,7 +600,9 @@ Settings::Settings() {
//!< concurrent Virtual GPUs for default
}
-bool Kernel::createSignature(const parameters_t& params) {
+bool Kernel::createSignature(
+ const parameters_t& params, const parameters_t& hiddenParams,
+ uint32_t version) {
std::stringstream attribs;
if (workGroupInfo_.compileSize_[0] != 0) {
attribs << "reqd_work_group_size(";
@@ -632,7 +634,7 @@ bool Kernel::createSignature(const parameters_t& params) {
// Destroy old signature if it was allocated before
// (offline devices path)
delete signature_;
- signature_ = new amd::KernelSignature(params, attribs.str());
+ signature_ = new amd::KernelSignature(params, attribs.str(), hiddenParams, version);
if (NULL != signature_) {
return true;
}
diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index 1b3d33f25a..4c32821207 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -852,7 +852,9 @@ class Kernel : public amd::HeapObject {
const std::string& name() const { return name_; }
//! Initializes the kernel parameters for the abstraction layer
- bool createSignature(const parameters_t& params);
+ bool createSignature(
+ const parameters_t& params, const parameters_t& hiddenParams,
+ uint32_t version);
//! Returns TRUE if it's a HSA kernel
bool hsa() const { return hsa_; }
@@ -1624,6 +1626,22 @@ class Device : public RuntimeObject {
};
struct KernelParameterDescriptor {
+ enum {
+ Value = 0,
+ HiddenNone = 1,
+ HiddenGlobalOffsetX = 2,
+ HiddenGlobalOffsetY = 3,
+ HiddenGlobalOffsetZ = 4,
+ HiddenPrintfBuffer = 5,
+ HiddenDefaultQueue = 6,
+ HiddenCompletionAction = 7,
+ MemoryObject = 8,
+ ReferenceObject = 9,
+ ValueObject = 10,
+ ImageObject = 11,
+ SamplerObject = 12,
+ QueueObject = 13
+ };
const char* name_; //!< The parameter's name in the source
clk_value_type_t type_; //!< The parameter's type
size_t offset_; //!< Its offset in the parameter's stack
@@ -1642,7 +1660,7 @@ struct KernelParameterDescriptor {
uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA
uint32_t defined_ : 1; //!< The argument was defined by the app
uint32_t reserved_ : 1; //!< reserved
- uint32_t arrayIndex_ : 28; //!< Index in the objects array
+ uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment
};
uint32_t allValues_;
InfoData() : allValues_(0) {}
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index e49fe8b63b..09c911022e 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -752,7 +752,8 @@ bool NullKernel::create(const std::string& code, const std::string& metadata,
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
device::Kernel::parameters_t params;
- if (!createSignature(params)) {
+ device::Kernel::parameters_t hiddenParams;
+ if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) {
return false;
}
@@ -1337,7 +1338,8 @@ bool Kernel::initParameters() {
workGroupInfo_.localMemSize_ = hwLocalSize_;
}
- if (!createSignature(params)) {
+ device::Kernel::parameters_t hiddenParams;
+ if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) {
return false;
}
@@ -3017,7 +3019,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
}
}
- createSignature(params);
+ device::Kernel::parameters_t hiddenParams;
+ createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
}
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index b8845ac7f6..f64af9d2cf 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -943,24 +943,30 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
uint32_t uint32_value = 0;
uint64_t uint64_value = 0;
+ size_t argSize = desc.size_;
if (desc.type_ == T_POINTER && desc.size_ != 0) {
if ((value == NULL) || (static_cast(value) == NULL)) {
- LP64_SWITCH(uint32_value, uint64_value) = 0;
reinterpret_cast(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
} else {
// convert cl_mem to amd::Memory*, return false if invalid.
LP64_SWITCH(uint32_value, uint64_value) = static_cast((
- *static_cast(value))->vmAddress());
+ *static_cast(value))->virtualAddress());
reinterpret_cast(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
*static_cast(value);
+ // Note: Special case for image SRD, which is 64 bit always
+ if (LP64_SWITCH(true, false) &&
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) {
+ uint64_value = uint32_value;
+ argSize = sizeof(uint64_t);
+ }
}
} else if (desc.type_ == T_SAMPLER) {
assert(false && "No sampler support in blit manager! Use internal samplers!");
} else
- switch (desc.size_) {
+ switch (argSize) {
case 1:
uint32_value = *static_cast(value);
break;
@@ -977,7 +983,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
break;
}
- switch (desc.size_) {
+ switch (argSize) {
case 0 /*local mem*/:
*static_cast(param) = size;
break;
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index c2a2be0c01..f181218413 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -228,6 +228,37 @@ inline static int GetHSAILArgSize(const aclArgData* argInfo) {
}
}
+inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
+ switch (arg->type_){
+ case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
+ case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
+ case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
+ case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER:
+ return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
+ case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
+ return amd::KernelParameterDescriptor::HiddenDefaultQueue;
+ case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
+ return amd::KernelParameterDescriptor::HiddenCompletionAction;
+ case HSAIL_ARGTYPE_POINTER:
+ return amd::KernelParameterDescriptor::MemoryObject;
+ case HSAIL_ARGTYPE_IMAGE:
+ return amd::KernelParameterDescriptor::ImageObject;
+ case HSAIL_ARGTYPE_REFERENCE:
+ return amd::KernelParameterDescriptor::ReferenceObject;
+ case HSAIL_ARGTYPE_VALUE:
+ return amd::KernelParameterDescriptor::ValueObject;
+ case HSAIL_ARGTYPE_SAMPLER:
+ return amd::KernelParameterDescriptor::SamplerObject;
+ case HSAIL_ARGTYPE_QUEUE:
+ return amd::KernelParameterDescriptor::QueueObject;
+ default:
+ return amd::KernelParameterDescriptor::HiddenNone;
+ }
+}
+
inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) {
static const clk_value_type_t ClkValueMapType[6][6] = {
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
@@ -422,12 +453,22 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
+ device::Kernel::parameters_t hiddenParams;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
+ size_t offsetStruct = argsBufferSize();
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
- // skip the hidden arguments
- if (arguments_[i]->index_ == uint(-1)) continue;
+ // Allocate the hidden arguments, but abstraction layer will skip them
+ if (arguments_[i]->index_ == uint(-1)) {
+ offset = amd::alignUp(offset, arguments_[i]->alignment_);
+ desc.offset_ = offset;
+ desc.size_ = arguments_[i]->size_;
+ offset += arguments_[i]->size_;
+ desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
+ hiddenParams.push_back(desc);
+ continue;
+ }
desc.name_ = arguments_[i]->name_.c_str();
desc.type_ = GetOclType(arguments_[i]);
@@ -435,6 +476,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
desc.accessQualifier_ = GetOclAccessQual(arguments_[i]);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arguments_[i]->typeName_.c_str();
+ desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
+ desc.info_.arrayIndex_ = arguments_[i]->pointeeAlignment_;
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
@@ -451,9 +494,32 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
// Local memory for CPU
size = sizeof(cl_mem);
}
- offset = amd::alignUp(offset, std::min(size, size_t(16)));
- desc.offset_ = offset;
- offset += amd::alignUp(size, sizeof(uint32_t));
+ // Check if HSAIL expects data by reference and allocate it behind
+ if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) {
+ desc.offset_ = offsetStruct;
+ // Align the offset reference
+ offset = amd::alignUp(offset, sizeof(size_t));
+ patchReferences_.insert({desc.offset_, offset});
+ offsetStruct += size;
+ // Adjust the offset of arguments
+ offset += sizeof(size_t);
+ } else {
+ // These objects have forced data size to uint64_t
+ if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+ offset = amd::alignUp(offset, sizeof(uint64_t));
+ desc.offset_ = offset;
+ offset += sizeof(uint64_t);
+ } else {
+ offset = amd::alignUp(offset, arguments_[i]->alignment_);
+ desc.offset_ = offset;
+ offset += size;
+ }
+ }
+ // Update read only flag
+ desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
+
params.push_back(desc);
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
@@ -464,7 +530,7 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
}
}
- createSignature(params);
+ createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1);
}
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
@@ -869,247 +935,79 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS
}
}
-template
-inline void WriteAqlArg(
- unsigned char** dst, //!< The write pointer to the buffer
- const T* src, //!< The source pointer
- uint size, //!< The size in bytes to copy
- uint alignment //!< The alignment to follow while writing to the buffer
-) {
- *dst = amd::alignUp(*dst, alignment);
- memcpy(*dst, src, size);
- *dst += size;
-}
-
-template <>
-inline void WriteAqlArg(
- unsigned char** dst, //!< The write pointer to the buffer
- const uint32_t* src, //!< The source pointer
- uint size, //!< The size in bytes to copy
- uint alignment //!< The alignment to follow while writing to the buffer
-) {
- *dst = amd::alignUp(*dst, alignment);
- *(reinterpret_cast(*dst)) = *src;
- *dst += size;
-}
-
-template <>
-inline void WriteAqlArg(
- unsigned char** dst, //!< The write pointer to the buffer
- const uint64_t* src, //!< The source pointer
- uint size, //!< The size in bytes to copy
- uint alignment //!< The alignment to follow while writing to the buffer
-) {
- *dst = amd::alignUp(*dst, alignment);
- *(reinterpret_cast(*dst)) = *src;
- *dst += size;
-}
-
-const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
- (1 << HSA_PACKET_HEADER_BARRIER) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
- const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
- static const bool WaitOnBusyEngine = true;
- uint64_t ldsAddress = ldsSize();
- address aqlArgBuf = gpu.cb(0)->SysMemCopy();
- bool srdResource = false;
+ const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
+ uint64_t argList;
+ address aqlArgBuf = gpu.managedBuffer().reserve(
+ argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
+ gpu.addVmMemory(gpu.managedBuffer().activeMemory());
if (dynamicParallelism()) {
// Provide the host parent AQL wrap object to the kernel
AmdAqlWrap wrap = {};
wrap.state = AQL_WRAP_BUSY;
- const ConstantBuffer* cb = gpu.cb(1);
- *vmParentWrap = cb->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
- gpu.addVmMemory(cb->ActiveMemory());
+ *vmParentWrap = gpu.cb(1)->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
+ gpu.addVmMemory(gpu.cb(1)->ActiveMemory());
}
const amd::KernelSignature& signature = kernel.signature();
- const amd::KernelParameters& kernelParams = kernel.parameters();
- amd::Memory* const* memories =
- reinterpret_cast(parameters + kernelParams.memoryObjOffset());
- // Find all parameters for the current kernel
- for (auto arg : arguments_) {
- const_address paramaddr = nullptr;
- if (arg->index_ != uint(-1)) {
- paramaddr = parameters + signature.at(arg->index_).offset_;
- }
-
- // Handle the hidden arguments first, as they do not have a
- // matching parameter in the OCL signature (not a valid arg->index_)
- switch (arg->type_) {
- case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
- size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
- assert(arg->size_ == sizeof(offset_x) && "check the sizes");
- WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_);
+ // Check if runtime has to setup hidden arguments
+ for (const auto& it : signature.hiddenParameters()) {
+ size_t offset;
+ switch (it.info_.oclObject_) {
+ case amd::KernelParameterDescriptor::HiddenNone:
+ //WriteAqlArgAt(aqlArgBuf, &zero, it.size_, it.offset_);
break;
- }
- case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
- size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
- assert(arg->size_ == sizeof(offset_y) && "check the sizes");
- WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_);
+ case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
+ offset = sizes.offset()[0];
+ WriteAqlArgAt(const_cast(parameters), &offset, it.size_, it.offset_);
break;
- }
- case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
- size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
- assert(arg->size_ == sizeof(offset_z) && "check the sizes");
- WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_);
+ case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
+ if (sizes.dimensions() >= 2) {
+ offset = sizes.offset()[1];
+ WriteAqlArgAt(const_cast(parameters), &offset, it.size_, it.offset_);
+ }
break;
- }
- case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
- size_t bufferPtr = 0;
+ case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
+ if (sizes.dimensions() >= 3) {
+ offset = sizes.offset()[2];
+ WriteAqlArgAt(const_cast(parameters), &offset, it.size_, it.offset_);
+ }
+ break;
+ case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
if ((printfInfo().size() > 0) &&
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
// and set the fourth argument as the printf_buffer pointer
- bufferPtr = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
+ size_t bufferPtr = static_cast(gpu.printfDbgHSA().
+ dbgBuffer()->vmAddress());
gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
- }
- assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
- WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_);
- break;
- }
- case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
- assert(arg->size_ == sizeof(static_cast(vmDefQueue)) && "check the sizes");
- WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_);
- break;
- case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
- assert(arg->size_ == sizeof(static_cast(*vmParentWrap)) && "check the sizes");
- WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_);
- break;
- case HSAIL_ARGTYPE_HIDDEN_NONE: {
- void* zero = 0;
- assert(arg->size_ <= sizeof(zero) && "check the sizes");
- WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_);
- break;
- }
- case HSAIL_ARGTYPE_POINTER: {
- // If it is a local pointer
- if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
- ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_);
- WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_);
- ldsAddress += *reinterpret_cast(paramaddr);
- break;
- }
- assert(
- (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) &&
- "Unsupported address qualifier");
- WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
- break;
- }
- case HSAIL_ARGTYPE_REFERENCE: {
- const ConstantBuffer* cb = gpu.cb(1);
- // Copy the current structure into CB1
- size_t gpuPtr = static_cast(cb->UploadDataToHw(paramaddr, arg->size_));
- // Then use a pointer in aqlArgBuffer to CB1
- WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t), sizeof(size_t));
- gpu.addVmMemory(cb->ActiveMemory());
- break;
- }
- case HSAIL_ARGTYPE_VALUE:
- if (arg->size_ == sizeof(uint32_t)) {
- WriteAqlArg(&aqlArgBuf, reinterpret_cast(paramaddr),
- sizeof(uint32_t), arg->alignment_);
- } else if (arg->size_ == sizeof(uint64_t)) {
- WriteAqlArg(&aqlArgBuf, reinterpret_cast(paramaddr),
- sizeof(uint64_t), arg->alignment_);
- } else {
- WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_);
+ WriteAqlArgAt(const_cast(parameters), &bufferPtr, it.size_, it.offset_);
}
break;
- case HSAIL_ARGTYPE_IMAGE: {
- Image* image = nullptr;
- amd::Memory* mem = nullptr;
- uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
- if (nativeMem) {
- image = reinterpret_cast(memories)[index];
- if (nullptr != image) {
- mem = image->owner();
- }
- } else {
- mem = memories[index];
- if (mem != nullptr) {
- image = static_cast(dev().getGpuMemory(mem));
- }
- }
-
- //! \note Special case for the image views.
- //! Copy SRD to CB1, so blit manager will be able to release
- //! this view without a wait for SRD resource.
- if (image->memoryType() == Resource::ImageView) {
- // Copy the current image SRD into CB1
- const ConstantBuffer* cb = gpu.cb(1);
- uint64_t srd = cb->UploadDataToHw(image->hwState(), HsaImageObjectSize);
- // Then use a pointer in aqlArgBuffer to CB1
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
- gpu.addVmMemory(cb->ActiveMemory());
- } else {
- uint64_t srd = image->hwSrd();
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
- srdResource = true;
- }
-
- if (image->desc().isDoppTexture_) {
- gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(),
- kernel.parameters().getExecPfpaVcop());
+ case amd::KernelParameterDescriptor::HiddenDefaultQueue:
+ if (vmDefQueue != 0) {
+ WriteAqlArgAt(const_cast(parameters), &vmDefQueue, it.size_, it.offset_);
}
break;
- }
- case HSAIL_ARGTYPE_SAMPLER: {
- uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
- const amd::Sampler* sampler = reinterpret_cast(parameters +
- kernelParams.samplerObjOffset())[index];
- const Sampler* gpuSampler = static_cast(sampler->getDeviceSampler(dev()));
- uint64_t srd = gpuSampler->hwSrd();
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
- srdResource = true;
- break;
- }
- case HSAIL_ARGTYPE_QUEUE: {
- uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
- const amd::DeviceQueue* queue = reinterpret_cast(
- parameters + kernelParams.queueObjOffset())[index];
- VirtualGPU* gpuQueue = static_cast(queue->vDev());
- uint64_t vmQueue;
- if (dev().settings().useDeviceQueue_) {
- vmQueue = gpuQueue->vQueue()->vmAddress();
- } else {
- if (!gpu.createVirtualQueue(queue->size())) {
- LogError("Virtual queue creation failed!");
- return nullptr;
- }
- vmQueue = gpu.vQueue()->vmAddress();
+ case amd::KernelParameterDescriptor::HiddenCompletionAction:
+ if (*vmParentWrap != 0) {
+ WriteAqlArgAt(const_cast(parameters), vmParentWrap, it.size_, it.offset_);
}
- WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue), sizeof(vmQueue));
break;
- }
- default:
- LogError(" Unsupported argument type ");
- return nullptr;
}
}
- if (ldsAddress > dev().info().localMemSize_) {
- LogError("No local memory available\n");
- return nullptr;
- }
+ // Load all kernel arguments
+ WriteAqlArgAt(aqlArgBuf, parameters, signature.paramsSize(), 0);
+ assert(argsBufferSize() == amd::alignUp(signature.paramsSize(), 16) &&
+ "A mismatch of sizes of arguments between compiler and runtime!");
-#if defined(WITH_LIGHTNING_COMPILER)
- // Check there is no arguments' buffer overflow. We may not use all the
- // hidden argument slots.
- assert(aqlArgBuf <= (gpu.cb(0)->SysMemCopy() + argsBufferSize()));
-#else // !defined(WITH_LIGHTNING_COMPILER)
- // HSAIL kernarg segment size is rounded up to multiple of 16.
- aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
- assert((aqlArgBuf == (gpu.cb(0)->SysMemCopy() + argsBufferSize())) &&
- "Size and the number of arguments don't match!");
-#endif // !defined(WITH_LIGHTNING_COMPILER)
- hsa_kernel_dispatch_packet_t* hsaDisp =
- reinterpret_cast(gpu.cb(0)->SysMemCopy() + argsBufferSize());
+ //hsa_kernel_dispatch_packet_t disp;
+ hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast(
+ gpu.cb(0)->SysMemCopy());
amd::NDRange local(sizes.local());
const amd::NDRange& global = sizes.global();
@@ -1117,6 +1015,12 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
// Check if runtime has to find local workgroup size
findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
+ constexpr uint16_t kDispatchPacketHeader =
+ (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+ (1 << HSA_PACKET_HEADER_BARRIER) |
+ (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+ (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+
hsaDisp->header = kDispatchPacketHeader;
hsaDisp->setup = sizes.dimensions();
@@ -1134,28 +1038,16 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
hsaDisp->group_segment_size = ldsAddress - ldsSize();
hsaDisp->kernel_object = gpuAqlCode();
- const ConstantBuffer* cb = gpu.cb(0);
- uint64_t argList = cb->UploadDataToHw(
- argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
-
hsaDisp->kernarg_address = reinterpret_cast(argList);
hsaDisp->reserved2 = 0;
hsaDisp->completion_signal.handle = 0;
+ memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));
- gpu.addVmMemory(cb->ActiveMemory());
- gpu.addVmMemory(&prog().codeSegGpu());
- for (pal::Memory* mem : prog().globalStores()) {
- gpu.addVmMemory(mem);
- }
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
- AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
+ AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
gpu.addVmMemory(gpu.hsaQueueMem());
}
- if (srdResource || prog().isStaticSampler()) {
- dev().srds().fillResourceList(gpu);
- }
-
return hsaDisp;
}
@@ -1398,6 +1290,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcA
void LightningKernel::initArgList(const KernelMD& kernelMD) {
device::Kernel::parameters_t params;
+ device::Kernel::parameters_t hiddenParams;
+ size_t offsetStruct = argsBufferSize();
size_t offset = 0;
@@ -1426,20 +1320,27 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) {
arg->index_ = isHidden ? uint(-1) : params.size();
arguments_.push_back(arg);
-
- if (isHidden) {
- continue;
- }
-
// Initialize Device kernel parameters
amd::KernelParameterDescriptor desc;
+ if (isHidden) {
+ offset = amd::alignUp(offset, arguments_[i]->alignment_);
+ desc.offset_ = offset;
+ desc.size_ = arguments_[i]->size_;
+ offset += arguments_[i]->size_;
+ desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
+ hiddenParams.push_back(desc);
+ continue;
+ }
+
desc.name_ = lcArg.mName.c_str();
desc.type_ = GetOclType(arg);
desc.addressQualifier_ = GetOclAddrQual(arg);
desc.accessQualifier_ = GetOclAccessQual(arg);
desc.typeQualifier_ = GetOclTypeQual(lcArg);
desc.typeName_ = lcArg.mTypeName.c_str();
+ desc.info_.oclObject_ = GetOclArgumentType(arg);
+ desc.info_.arrayIndex_ = arg->pointeeAlignment_;
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
@@ -1456,14 +1357,37 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) {
// Local memory for CPU
size = sizeof(cl_mem);
}
- offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
- desc.offset_ = offset;
- offset += amd::alignUp(size, sizeof(uint32_t));
+ // Check if HSAIL expects data by reference and allocate it behind
+ if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) {
+ desc.offset_ = offsetStruct;
+ // Align the offset reference
+ offset = amd::alignUp(offset, sizeof(size_t));
+ patchReferences_.insert({ desc.offset_, offset });
+ offsetStruct += size;
+ // Adjust the offset of arguments
+ offset += sizeof(size_t);
+ }
+ else {
+ // These objects have forced data size to uint64_t
+ if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+ offset = amd::alignUp(offset, sizeof(uint64_t));
+ desc.offset_ = offset;
+ offset += sizeof(uint64_t);
+ } else {
+ offset = amd::alignUp(offset, arguments_[i]->alignment_);
+ desc.offset_ = offset;
+ offset += size;
+ }
+ }
+ // Update read only flag
+ desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
params.push_back(desc);
}
- createSignature(params);
+ createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1);
}
static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
index 66e4132055..7ffc144c8b 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
@@ -182,7 +182,7 @@ class HSAILKernel : public device::Kernel {
const amd::Kernel& kernel, //!< AMD kernel object
const amd::NDRangeContainer& sizes, //!< NDrange container
const_address parameters, //!< Application arguments for the kernel
- bool nativeMem, //!< Native memory objects are passed
+ size_t ldsAddress, //!< LDS address that includes all arguments.
uint64_t vmDefQueue, //!< GPU VM default queue pointer
uint64_t* vmParentWrap //!< GPU VM parent aql wrap object
) const;
@@ -204,6 +204,8 @@ class HSAILKernel : public device::Kernel {
return waveLimiter_.getWavesPerSH(vdev);
};
+ const std::unordered_map& patch() const { return patchReferences_; }
+
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
@@ -234,6 +236,7 @@ class HSAILKernel : public device::Kernel {
const HSAILProgram& prog_; //!< Reference to the parent program
std::vector printf_; //!< Format strings for GPU printf support
uint index_; //!< Kernel index in the program
+ std::unordered_map patchReferences_; //!< Patch table for references
uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
index 3f47b22e12..00fd8736d3 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
@@ -219,6 +219,8 @@ class Image : public pal::Memory {
size_t* slicePitch = NULL //!< Slice for the mapped memory
);
+ virtual uint64_t virtualAddress() const override { return hwSrd(); }
+
private:
//! Disable copy constructor
Image(const Image&);
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index b2e373a4f5..7c45951176 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -461,9 +461,8 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
if (flushL1Cache) {
// Flush cache
if (!gpu.profiling()) {
- gpu.addBarrier();
+ gpu.addBarrier();
}
-
// Clear memory dependency state
const static bool All = true;
clear(!All);
@@ -2112,13 +2111,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
return false;
}
}
-
+ size_t ldsSize;
// Check memory dependency and SVM objects
- if (!processMemObjectsHSA(kernel, parameters, nativeMem)) {
+ if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
LogError("Wrong memory objects!");
return false;
}
-
bool needFlush = false;
// Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
// are in the same cmdBuffer
@@ -2194,7 +2192,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
uint64_t vmParentWrap = 0;
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
- *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap);
+ *this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
@@ -2948,7 +2946,7 @@ void VirtualGPU::profileEvent(EngineType engine, bool type) const {
}
bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
- bool nativeMem) {
+ bool nativeMem, size_t& ldsAddress) {
const amd::KernelParameters& kernelParams = kernel.parameters();
// Mark the tracker with a new kernel,
@@ -3015,68 +3013,155 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
}
}
+ bool srdResource = false;
amd::Memory* const* memories =
reinterpret_cast(params + kernelParams.memoryObjOffset());
const HSAILKernel& hsaKernel =
static_cast(*(kernel.getDeviceKernel(dev())));
const amd::KernelSignature& signature = kernel.signature();
+ ldsAddress = hsaKernel.ldsSize();
- // Check all parameters for the current kernel
- for (size_t i = 0; i < signature.numParameters(); ++i) {
- const amd::KernelParameterDescriptor& desc = signature.at(i);
- const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i);
-
- // Find if current argument is a buffer
- if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
- Memory* gpuMem = nullptr;
- amd::Memory* mem = nullptr;
- uint32_t index = desc.info_.arrayIndex_;
- if (nativeMem) {
- gpuMem = reinterpret_cast(memories)[index];
- if (nullptr != gpuMem) {
- mem = gpuMem->owner();
- }
- } else {
- mem = memories[index];
- if (mem != nullptr) {
- gpuMem = dev().getGpuMemory(mem);
- // Synchronize data with other memory instances if necessary
- gpuMem->syncCacheFromHost(*this);
- }
- }
- //! This condition is for SVM fine-grain
- if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
- addBarrier();
- // Clear memory dependency state
- const static bool All = true;
- memoryDependency().clear(!All);
- continue;
- } else if (gpuMem != nullptr) {
- // Check image
- bool readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
- // Check buffer
- readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
- // Validate memory for a dependency in the queue
- memoryDependency().validate(*this, gpuMem, readOnly);
-
- // Wait for resource if it was used on an inactive engine
- //! \note syncCache may call DRM transfer
- constexpr bool WaitOnBusyEngine = true;
- gpuMem->wait(*this, WaitOnBusyEngine);
-
- //! Check if compiler expects read/write
- if ((mem != nullptr) && !desc.info_.readOnly_) {
- mem->signalWrite(&dev());
- }
- addVmMemory(gpuMem);
+ if (!nativeMem) {
+ // Process cache coherency first, since the extra transfers may affect
+ // other mem dependency tracking logic: TS and signalWrite()
+ for (uint i = 0; i < signature.numMemories(); ++i) {
+ amd::Memory* mem = memories[i];
+ if (mem != nullptr) {
+ // Synchronize data with other memory instances if necessary
+ dev().getGpuMemory(mem)->syncCacheFromHost(*this);
}
}
}
- for (pal::Memory* mem : hsaKernel.prog().globalStores()) {
+ // Check all parameters for the current kernel
+ for (size_t i = 0; i < signature.numParameters(); ++i) {
+ const amd::KernelParameterDescriptor& desc = signature.at(i);
+ const amd::KernelParameterDescriptor::InfoData& info = desc.info_;
+
+ // Find if current argument is a buffer
+ if (desc.type_ == T_POINTER) {
+ // If it is a local pointer
+ if (desc.size_ == 0) {
+ ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
+ // Save the original LDS size
+ size_t ldsSize = *reinterpret_cast(params + desc.offset_);
+ // Patch the LDS address in the original arguments with an LDS address(offset)
+ WriteAqlArgAt(const_cast(params), &ldsAddress, sizeof(void*), desc.offset_);
+ // Add the original size
+ ldsAddress += ldsSize;
+ } else {
+ Memory* gpuMem = nullptr;
+ amd::Memory* mem = nullptr;
+ uint32_t index = info.arrayIndex_;
+ if (nativeMem) {
+ gpuMem = reinterpret_cast(memories)[index];
+ if (nullptr != gpuMem) {
+ mem = gpuMem->owner();
+ }
+ } else {
+ mem = memories[index];
+ if (mem != nullptr) {
+ gpuMem = dev().getGpuMemory(mem);
+ }
+ }
+ //! This condition is for SVM fine-grain
+ if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
+ addBarrier();
+ // Clear memory dependency state
+ const static bool All = true;
+ memoryDependency().clear(!All);
+ continue;
+ } else if (gpuMem != nullptr) {
+ // Validate memory for a dependency in the queue
+ memoryDependency().validate(*this, gpuMem, info.readOnly_);
+ // Wait for resource if it was used on an inactive engine
+ //! \note syncCache may call DRM transfer
+ constexpr bool WaitOnBusyEngine = true;
+ gpuMem->wait(*this, WaitOnBusyEngine);
+
+ addVmMemory(gpuMem);
+
+ //! Check if compiler expects read/write.
+ //! Note: SVM with subbuffers has an issue with tracking.
+ //! Conformance can send read only subbuffer, but update the region
+ //! in the kernel.
+ if ((mem != nullptr) &&
+ ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
+ ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
+ mem->signalWrite(&dev());
+ }
+ if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+ //! \note Special case for the image views.
+ //! Copy SRD to CB1, so blit manager will be able to release
+ //! this view without a wait for SRD resource.
+ if (gpuMem->memoryType() == Resource::ImageView) {
+ // Copy the current image SRD into CB1
+ uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize);
+ // Then use a pointer in aqlArgBuffer to CB1
+ // Patch the GPU VA address in the original arguments
+ WriteAqlArgAt(const_cast(params), &srd, sizeof(srd), desc.offset_);
+ addVmMemory(cb(1)->ActiveMemory());
+ } else {
+ srdResource = true;
+ }
+ if (gpuMem->desc().isDoppTexture_) {
+ addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
+ kernel.parameters().getExecPfpaVcop());
+ }
+ }
+ }
+ }
+ }
+ else if (desc.type_ == T_VOID) {
+ if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
+ // Copy the current structure into CB1
+ size_t gpuPtr = static_cast(cb(1)->UploadDataToHw(params, desc.size_));
+ // Then use a pointer in aqlArgBuffer to CB1
+ const auto it = hsaKernel.patch().find(desc.offset_);
+ // Patch the GPU VA address in the original arguments
+ WriteAqlArgAt(const_cast(params), &gpuPtr, sizeof(size_t), it->second);
+ addVmMemory(cb(1)->ActiveMemory());
+ }
+ }
+ else if (desc.type_ == T_SAMPLER) {
+ srdResource = true;
+ } else if (desc.type_ == T_QUEUE) {
+ uint32_t index = desc.info_.arrayIndex_;
+ const amd::DeviceQueue* queue = reinterpret_cast(
+ params + kernelParams.queueObjOffset())[index];
+ VirtualGPU* gpuQueue = static_cast(queue->vDev());
+ uint64_t vmQueue;
+ if (dev().settings().useDeviceQueue_) {
+ vmQueue = gpuQueue->vQueue()->vmAddress();
+ } else {
+ if (!createVirtualQueue(queue->size())) {
+ LogError("Virtual queue creation failed!");
+ return false;
+ }
+ vmQueue = vQueue()->vmAddress();
+ }
+ // Patch the GPU VA address in the original arguments
+ WriteAqlArgAt(const_cast(params), &vmQueue, sizeof(vmQueue), desc.offset_);
+ break;
+ }
+ }
+
+ if (ldsAddress > dev().info().localMemSize_) {
+ LogError("No local memory available\n");
+ return false;
+ }
+
+ if (srdResource || hsaKernel.prog().isStaticSampler()) {
+ dev().srds().fillResourceList(*this);
+ }
+
+ addVmMemory(&hsaKernel.prog().codeSegGpu());
+
+ for (const pal::Memory* mem : hsaKernel.prog().globalStores()) {
const static bool IsReadOnly = false;
// Validate global store for a dependency in the queue
memoryDependency().validate(*this, mem, IsReadOnly);
+ addVmMemory(mem);
}
return true;
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index fa48024c9d..fccee6d60e 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -378,6 +378,9 @@ class VirtualGPU : public device::VirtualDevice {
//! Return xfer buffer for staging operations
XferBuffer& xferWrite() { return writeBuffer_; }
+ //! Return managed buffer for staging operations
+ ManagedBuffer& managedBuffer() { return managedBuffer_; }
+
//! Adds a pinned memory object into a map
void addPinnedMem(amd::Memory* mem);
@@ -529,7 +532,8 @@ class VirtualGPU : public device::VirtualDevice {
//! Detects memory dependency for HSAIL kernels and flushes caches
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
- bool nativeMem //!< Native memory objects
+ bool nativeMem, //!< Native memory objects
+ size_t& ldsAddress //!< [out] LDS size used by the kernel: its ldsSize() plus all local pointer arguments
);
//! Common function for fill memory used by both svm Fill and non-svm fill
@@ -644,4 +648,33 @@ uint VirtualGPU::Queue::submit(bool forceFlush) {
return id;
}
+template
+inline void WriteAqlArgAt(
+ unsigned char* dst, //!< Base pointer of the destination buffer
+ const T* src, //!< The source pointer
+ uint size, //!< The size in bytes to copy
+ size_t offset //!< The byte offset from dst at which the data is written
+) {
+ memcpy(dst + offset, src, size); // Generic path: raw copy of size bytes to dst + offset
+}
+
+template <>
+inline void WriteAqlArgAt(
+ unsigned char* dst, //!< Base pointer of the destination buffer
+ const uint32_t* src, //!< Pointer to the 32-bit value to store
+ uint size, //!< Unused by this specialization
+ size_t offset //!< The byte offset from dst at which the value is written
+) {
+ *(reinterpret_cast(dst + offset)) = *src; // Direct store; NOTE(review): assumes dst + offset is suitably aligned - confirm
+}
+
+template <>
+inline void WriteAqlArgAt(
+ unsigned char* dst, //!< Base pointer of the destination buffer
+ const uint64_t* src, //!< Pointer to the 64-bit value to store
+ uint size, //!< Unused by this specialization
+ size_t offset //!< The byte offset from dst at which the value is written
+) {
+ *(reinterpret_cast(dst + offset)) = *src; // Direct store; NOTE(review): assumes dst + offset is suitably aligned - confirm
+}
/*@}*/} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
index f0bf4e95c6..b7cf1ed1c3 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
@@ -581,7 +581,8 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
params.push_back(desc);
}
- createSignature(params);
+ device::Kernel::parameters_t hiddenParams;
+ createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
}
#endif // defined(WITH_COMPILER_LIB)
@@ -660,8 +661,8 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
params.push_back(desc);
}
-
- createSignature(params);
+ device::Kernel::parameters_t hiddenParams;
+ createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
}
#endif // defined(WITH_LIGHTNING_COMPILER)
diff --git a/projects/clr/rocclr/runtime/platform/kernel.cpp b/projects/clr/rocclr/runtime/platform/kernel.cpp
index 38fc9cabe0..a4616033aa 100644
--- a/projects/clr/rocclr/runtime/platform/kernel.cpp
+++ b/projects/clr/rocclr/runtime/platform/kernel.cpp
@@ -243,13 +243,17 @@ void KernelParameters::release(address mem, const amd::Device& device) const {
}
KernelSignature::KernelSignature(const std::vector& params,
- const std::string& attrib)
+ const std::string& attrib,
+ const std::vector& hiddenParams,
+ uint32_t version)
: params_(params)
+ , hiddenParams_(hiddenParams)
, attributes_(attrib)
, paramsSize_(0)
, numMemories_(0)
, numSamplers_(0)
- , numQueues_(0) {
+ , numQueues_(0)
+ , version_(version) {
size_t maxOffset = 0;
size_t last = 0;
// Find the last entry
@@ -283,7 +287,15 @@ KernelSignature::KernelSignature(const std::vector& p
if (lastSize == 0 /* local mem */) {
lastSize = sizeof(cl_mem);
}
- paramsSize_ = params[last].offset_ + alignUp(lastSize, sizeof(intptr_t));
+ // Note: It's a special case. HW ABI expects 64 bit for SRD, regardless of the binary.
+ // Force the size to 64 bit for those cases.
+ if ((params[last].info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+ (params[last].info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+ (params[last].info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+ lastSize = alignUp(lastSize, sizeof(uint64_t));
+ }
+ paramsSize_ = params[last].offset_ + lastSize;
+ paramsSize_ = alignUp(paramsSize_, sizeof(intptr_t));
}
}
} // namespace amd
diff --git a/projects/clr/rocclr/runtime/platform/kernel.hpp b/projects/clr/rocclr/runtime/platform/kernel.hpp
index 838c5d7198..f506241ee0 100644
--- a/projects/clr/rocclr/runtime/platform/kernel.hpp
+++ b/projects/clr/rocclr/runtime/platform/kernel.hpp
@@ -36,18 +36,30 @@ class Program;
class KernelSignature : public HeapObject {
private:
std::vector params_;
+ std::vector hiddenParams_;
std::string attributes_; //!< The kernel attributes
uint32_t paramsSize_;
uint32_t numMemories_;
uint32_t numSamplers_;
uint32_t numQueues_;
+ uint32_t version_;
public:
+ enum {
+ ABIVersion_0 = 0, //!< ABI constructed based on the OCL semantics
+ ABIVersion_1 = 1 //!< ABI constructed based on the HW ABI returned from the compiler
+ };
+
//! Default constructor
- KernelSignature() : paramsSize_(0), numMemories_(0), numSamplers_(0), numQueues_(0) {}
+ KernelSignature():
+ paramsSize_(0), numMemories_(0), numSamplers_(0),
+ numQueues_(0), version_(ABIVersion_0) {}
//! Construct a new signature.
- KernelSignature(const std::vector& params, const std::string& attrib);
+ KernelSignature(const std::vector& params,
+ const std::string& attrib,
+ const std::vector& hiddenParams,
+ uint32_t version);
//! Return the number of parameters
size_t numParameters() const { return params_.size(); }
@@ -72,8 +84,17 @@ class KernelSignature : public HeapObject {
//! Returns the number of queue objects.
uint32_t numQueues() const { return numQueues_; }
+ //! Returns the signature version
+ uint32_t version() const { return version_; }
+
//! Return the kernel attributes
const std::string& attributes() const { return attributes_; }
+
+ const std::vector& hiddenParameters() const
+ { return hiddenParams_; } //!< Return the hidden parameters supplied at construction
+
+ const std::vector& parameters() const
+ { return params_; } //!< Return the kernel parameters supplied at construction
};
// @todo: look into a copy-on-write model instead of copy-on-read.
diff --git a/projects/clr/rocclr/runtime/platform/program.cpp b/projects/clr/rocclr/runtime/platform/program.cpp
index 055c351b07..9b68004437 100644
--- a/projects/clr/rocclr/runtime/platform/program.cpp
+++ b/projects/clr/rocclr/runtime/platform/program.cpp
@@ -604,8 +604,8 @@ bool Program::ParseAllOptions(const std::string& options, option::Options& parse
}
bool Symbol::setDeviceKernel(const Device& device, const device::Kernel* func) {
- // FIXME_lmoriche: check that the signatures are compatible
- if (deviceKernels_.size() == 0) {
+ if (deviceKernels_.size() == 0 ||
+ (func->signature().version() > KernelSignature::ABIVersion_0)) {
signature_ = func->signature();
}
deviceKernels_[&device] = func;