From 9cd2db67f6fcff17111a3af11da54f2c56a0d2be Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 3 Aug 2018 16:05:12 -0400
Subject: [PATCH] P4 to Git Change 1589476 by axie@axie-rocm-opencl on
2018/08/03 15:54:24
SWDEV-79445 - OCL generic changes and code clean-up
- Optimize setup of kernel arguments.
- Add HW ABI support in the abstraction layer
- Remove arguments parsing loop from the kernel launch. Memory processing will be responsible for dependency tracking and patching of arguments.
ReviewBoardURL = http://ocltc.amd.com/reviews/r/15400/
Tests:
1. ./run_conformance.py ./opencl_conformance_tests_reallyquick.csv CL_DEVICE_TYPE_GPU for openCL 1.2: OpenCL-GL sharing failed. This is not a regression.
2. ./ocltst -m oclruntime.so -A oclruntime.exclude
3. ./run_conformance.py opencl_conformance_tests_lightning.csv CL_DEVICE_TYPE_GPU : PASS
4. teamcity test: http://ocltc.amd.com:8111/viewModification.html?modId=104598&personal=true&buildTypeId=&tab=vcsModificationBuilds&show_all_builds=true
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#34 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#60 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#17 edit
---
rocclr/runtime/device/rocm/rockernel.cpp | 134 +++++-
rocclr/runtime/device/rocm/rockernel.hpp | 3 +
rocclr/runtime/device/rocm/rocsettings.cpp | 1 -
rocclr/runtime/device/rocm/rocsettings.hpp | 1 -
rocclr/runtime/device/rocm/rocvirtual.cpp | 459 +++++++++------------
rocclr/runtime/device/rocm/rocvirtual.hpp | 33 +-
6 files changed, 356 insertions(+), 275 deletions(-)
diff --git a/rocclr/runtime/device/rocm/rockernel.cpp b/rocclr/runtime/device/rocm/rockernel.cpp
index 68e6d96944..47268bc612 100644
--- a/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/rocclr/runtime/device/rocm/rockernel.cpp
@@ -231,6 +231,37 @@ static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo)
return ROC_ADDRESS_ERROR;
}
+inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) { // Maps arg->type_ (a ROC_ARGTYPE_* value) to the matching amd::KernelParameterDescriptor value
+ switch (arg->type_){
+ case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
+ case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
+ case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
+ return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
+ case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER:
+ return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
+ case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
+ return amd::KernelParameterDescriptor::HiddenDefaultQueue;
+ case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
+ return amd::KernelParameterDescriptor::HiddenCompletionAction;
+ case ROC_ARGTYPE_POINTER:
+ return amd::KernelParameterDescriptor::MemoryObject;
+ case ROC_ARGTYPE_IMAGE:
+ return amd::KernelParameterDescriptor::ImageObject;
+ case ROC_ARGTYPE_REFERENCE:
+ return amd::KernelParameterDescriptor::ReferenceObject;
+ case ROC_ARGTYPE_VALUE:
+ return amd::KernelParameterDescriptor::ValueObject;
+ case ROC_ARGTYPE_SAMPLER:
+ return amd::KernelParameterDescriptor::SamplerObject;
+ case ROC_ARGTYPE_QUEUE:
+ return amd::KernelParameterDescriptor::QueueObject;
+ default: // every type_ value not listed above is reported as HiddenNone
+ return amd::KernelParameterDescriptor::HiddenNone;
+ }
+}
+
#if defined(WITH_LIGHTNING_COMPILER)
static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
aclArgDataType dataType;
@@ -514,6 +545,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argI
#if defined(WITH_COMPILER_LIB)
void HSAILKernel::initArguments(const aclArgData* aclArg) {
device::Kernel::parameters_t params;
+ device::Kernel::parameters_t hiddenParams;
+ size_t offsetStruct = KernargSegmentByteSize();
// Iterate through the arguments and insert into parameterList
for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) {
@@ -539,17 +572,27 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
arg->index_ = isHidden ? uint(-1) : params.size();
hsailArgList_.push_back(arg);
+ amd::KernelParameterDescriptor desc;
+
+ // Allocate the hidden arguments, but abstraction layer will skip them
if (isHidden) {
+ offset = amd::alignUp(offset, arg->alignment_);
+ desc.offset_ = offset;
+ desc.size_ = arg->size_;
+ offset += arg->size_;
+ desc.info_.oclObject_ = GetOclArgumentType(arg);
+ hiddenParams.push_back(desc);
continue;
}
- amd::KernelParameterDescriptor desc;
desc.name_ = arg->name_.c_str();
desc.type_ = GetOclType(arg);
desc.addressQualifier_ = GetOclAddrQual(arg);
desc.accessQualifier_ = GetOclAccessQual(arg);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arg->typeName_.c_str();
+ desc.info_.oclObject_ = GetOclArgumentType(arg);
+ desc.info_.arrayIndex_ = arg->pointeeAlignment_;
// set image related flags
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
@@ -566,19 +609,48 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
// and CPU sends the parameters as they are allocated in memory
size_t size = desc.size_;
- offset = amd::alignUp(offset, std::min(size, size_t(16)));
- desc.offset_ = offset;
- offset += amd::alignUp(size, sizeof(uint32_t));
+ // Check if HSAIL expects data by reference and allocate it behind
+ if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
+ desc.offset_ = offsetStruct;
+ // Align the offset reference
+ offset = amd::alignUp(offset, sizeof(size_t));
+ patchReferences_.insert({desc.offset_, offset});
+ offsetStruct += size;
+ // Adjust the offset of arguments
+ offset += sizeof(size_t);
+ }
+ else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+ // These objects have forced data size to uint64_t
+ offset = amd::alignUp(offset, sizeof(uint64_t));
+ desc.offset_ = offset;
+ offset += sizeof(uint64_t);
+ } else {
+ offset = amd::alignUp(offset, arg->alignment_);
+ desc.offset_ = offset;
+ offset += size;
+ }
+
+ // Update read only flag
+ desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
params.push_back(desc);
}
- createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
+
+ // Save the number of OCL arguments
+ uint32_t numParams = params.size();
+ // Append the hidden arguments to the OCL arguments
+ params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
+ createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
}
#endif // defined(WITH_COMPILER_LIB)
#if defined(WITH_LIGHTNING_COMPILER)
void LightningKernel::initArguments(const KernelMD& kernelMD) {
device::Kernel::parameters_t params;
+ device::Kernel::parameters_t hiddenParams;
+ size_t offsetStruct = KernargSegmentByteSize();
size_t offset = 0;
@@ -607,19 +679,27 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
arg->index_ = isHidden ? uint(-1) : params.size();
hsailArgList_.push_back(arg);
- if (isHidden) {
- continue;
- }
-
// Initialize Device kernel parameters
amd::KernelParameterDescriptor desc;
+ if (isHidden) {
+ offset = amd::alignUp(offset, arg->alignment_);
+ desc.offset_ = offset;
+ desc.size_ = arg->size_;
+ offset += arg->size_;
+ desc.info_.oclObject_ = GetOclArgumentType(arg);
+ hiddenParams.push_back(desc);
+ continue;
+ }
+
desc.name_ = lcArg.mName.c_str();
desc.type_ = GetOclType(arg);
desc.addressQualifier_ = GetOclAddrQual(arg);
desc.accessQualifier_ = GetOclAccessQual(arg);
desc.typeQualifier_ = GetOclTypeQual(lcArg);
desc.typeName_ = lcArg.mTypeName.c_str();
+ desc.info_.oclObject_ = GetOclArgumentType(arg);
+ desc.info_.arrayIndex_ = arg->pointeeAlignment_;
// set image related flags
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
@@ -629,6 +709,7 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
flags_.imageWrite_ = true;
}
}
+
desc.size_ = arg->size_;
// Make offset alignment to match CPU metadata, since
@@ -636,13 +717,40 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
// and CPU sends the parameters as they are allocated in memory
size_t size = desc.size_;
- offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
- desc.offset_ = offset;
- offset += amd::alignUp(size, sizeof(uint32_t));
+ // Check if HSAIL expects data by reference and allocate it behind
+ if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
+ desc.offset_ = offsetStruct;
+ // Align the offset reference
+ offset = amd::alignUp(offset, sizeof(size_t));
+ patchReferences_.insert({desc.offset_, offset});
+ offsetStruct += size;
+ // Adjust the offset of arguments
+ offset += sizeof(size_t);
+ }
+ else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+ (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+ // These objects have forced data size to uint64_t
+ offset = amd::alignUp(offset, sizeof(uint64_t));
+ desc.offset_ = offset;
+ offset += sizeof(uint64_t);
+ } else {
+ offset = amd::alignUp(offset, arg->alignment_);
+ desc.offset_ = offset;
+ offset += size;
+ }
+
+ // Update read only flag
+ desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
params.push_back(desc);
}
- createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
+
+ // Save the number of OCL arguments
+ uint32_t numParams = params.size();
+ // Append the hidden arguments to the OCL arguments
+ params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
+ createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
}
#endif // defined(WITH_LIGHTNING_COMPILER)
diff --git a/rocclr/runtime/device/rocm/rockernel.hpp b/rocclr/runtime/device/rocm/rockernel.hpp
index f0b8690e71..0c1c0f7e18 100644
--- a/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/rocclr/runtime/device/rocm/rockernel.hpp
@@ -140,6 +140,8 @@ class Kernel : public device::Kernel {
//! Return TRUE if kernel wirtes images
bool imageWrite() const { return (flags_.imageWrite_) ? true : false; }
+ const std::unordered_map& patch() const { return patchReferences_; }
+
protected:
union Flags {
struct {
@@ -162,6 +164,7 @@ class Kernel : public device::Kernel {
const uint32_t kernargSegmentAlignment_;
size_t kernelDirectiveOffset_;
std::vector printf_;
+ std::unordered_map patchReferences_; //!< Patch table for references
};
#if defined(WITH_COMPILER_LIB)
diff --git a/rocclr/runtime/device/rocm/rocsettings.cpp b/rocclr/runtime/device/rocm/rocsettings.cpp
index a768801d3b..6c6c7d71da 100644
--- a/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -20,7 +20,6 @@ Settings::Settings() {
pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION;
enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
- enableImageHandle_ = true;
maxWorkGroupSize_ = 1024;
preferredWorkGroupSize_ = 256;
diff --git a/rocclr/runtime/device/rocm/rocsettings.hpp b/rocclr/runtime/device/rocm/rocsettings.hpp
index 1ecd636d2a..d3c601de4c 100644
--- a/rocclr/runtime/device/rocm/rocsettings.hpp
+++ b/rocclr/runtime/device/rocm/rocsettings.hpp
@@ -22,7 +22,6 @@ class Settings : public device::Settings {
uint doublePrecision_ : 1; //!< Enables double precision support
uint pollCompletion_ : 1; //!< Enables polling in HSA
uint enableLocalMemory_ : 1; //!< Enable GPUVM memory
- uint enableImageHandle_ : 1; //!< Use HSAIL image/sampler pointer
uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory
uint enablePartialDispatch_ : 1; //!< Enable support for Partial Dispatch
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp
index 41c95d9995..9d9cdecd34 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -185,8 +185,37 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
}
}
-bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params) {
- const Kernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev())));
+static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor,
+ const amd::Sampler& sampler) { // Fills filter_mode, coordinate_mode and address_mode of samplerDescriptor from the CL sampler state
+ samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
+ ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
+ : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; // any filter mode other than NEAREST selects LINEAR
+ samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
+ ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
+ : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
+ switch (sampler.addressingMode()) {
+ case CL_ADDRESS_CLAMP_TO_EDGE:
+ samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
+ break;
+ case CL_ADDRESS_REPEAT:
+ samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
+ break;
+ case CL_ADDRESS_CLAMP: // CL "clamp" is expressed as HSA clamp-to-border
+ samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
+ break;
+ case CL_ADDRESS_MIRRORED_REPEAT:
+ samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
+ break;
+ case CL_ADDRESS_NONE:
+ samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
+ break;
+ default:
+ return; // NOTE(review): address_mode is not assigned on this path; the caller passes a descriptor it did not initialize
+ }
+}
+
+bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params, size_t& ldsAddress) {
+ Kernel& hsaKernel = const_cast(static_cast(*(kernel.getDeviceKernel(dev()))));
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
@@ -256,38 +285,141 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
- const Kernel::Argument* arg = hsaKernel.hsailArgAt(i);
Memory* gpuMem = nullptr;
- bool readOnly = false;
amd::Memory* mem = nullptr;
// Find if current argument is a buffer
- if ((desc.type_ == T_POINTER) && (arg->addrQual_ != ROC_ADDRESS_LOCAL)) {
- uint32_t index = desc.info_.arrayIndex_;
- mem = memories[index];
- if (mem != nullptr) {
- gpuMem = static_cast(mem->getDeviceMemory(dev()));
- // Don't sync for internal objects,
- // since they are not shared between devices
- if (gpuMem->owner()->getVirtualDevice() == nullptr) {
- // Synchronize data with other memory instances if necessary
- gpuMem->syncCacheFromHost(*this);
+ if (desc.type_ == T_POINTER) {
+ if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
+ // Align the LDS on the alignment requirement of type pointed to
+ ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
+ if (desc.size_ == 8) {
+ // Save the original LDS size
+ uint64_t ldsSize = *reinterpret_cast(params + desc.offset_);
+ // Patch the LDS address in the original arguments with an LDS address(offset)
+ WriteAqlArgAt(const_cast(params), &ldsAddress, desc.size_, desc.offset_);
+ // Add the original size
+ ldsAddress += ldsSize;
+ } else {
+ // Save the original LDS size
+ uint32_t ldsSize = *reinterpret_cast(params + desc.offset_);
+ // Patch the LDS address in the original arguments with an LDS address(offset)
+ uint32_t ldsAddr = ldsAddress;
+ WriteAqlArgAt(const_cast(params), &ldsAddr, desc.size_, desc.offset_);
+ // Add the original size
+ ldsAddress += ldsSize;
}
}
- //! This condition is for SVM fine-grain
- if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
- // Sync AQL packets
- setAqlHeader(kDispatchPacketHeader);
- // Clear memory dependency state
- const static bool All = true;
- memoryDependency().clear(!All);
- continue;
- } else if (gpuMem != nullptr) {
- readOnly |= (arg->access_ == ROC_ACCESS_TYPE_RO);
- // Validate memory for a dependency in the queue
- memoryDependency().validate(*this, gpuMem, readOnly);
+ else {
+ uint32_t index = desc.info_.arrayIndex_;
+ mem = memories[index];
+ if (mem == nullptr) {
+ //! This condition is for SVM fine-grain
+ if (dev().isFineGrainedSystem(true)) {
+ // Sync AQL packets
+ setAqlHeader(kDispatchPacketHeader);
+ // Clear memory dependency state
+ const static bool All = true;
+ memoryDependency().clear(!All);
+ }
+ }
+ else {
+ gpuMem = static_cast(mem->getDeviceMemory(dev()));
+ // Don't sync for internal objects,
+ // since they are not shared between devices
+ if (gpuMem->owner()->getVirtualDevice() == nullptr) {
+ // Synchronize data with other memory instances if necessary
+ gpuMem->syncCacheFromHost(*this);
+ }
+
+ // Validate memory for a dependency in the queue
+ memoryDependency().validate(*this, gpuMem, (desc.info_.readOnly_ == 1));
+
+ assert((desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_GLOBAL ||
+ desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) &&
+ "Unsupported address qualifier");
+
+ const bool readOnly =
+#if defined(WITH_LIGHTNING_COMPILER)
+ desc.typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST ||
+#endif // defined(WITH_LIGHTNING_COMPILER)
+ (mem->getMemFlags() & CL_MEM_READ_ONLY) != 0;
+
+ if (!readOnly) {
+ mem->signalWrite(&dev());
+ }
+
+ if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+ Image* image = static_cast(mem->getDeviceMemory(dev()));
+
+ const uint64_t image_srd = image->getHsaImageObject().handle;
+ assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
+ WriteAqlArgAt(const_cast(params), &image_srd, sizeof(image_srd), desc.offset_);
+ }
+ }
}
}
+ else if (desc.type_ == T_QUEUE) {
+ uint32_t index = desc.info_.arrayIndex_;
+ const amd::DeviceQueue* queue = reinterpret_cast(
+ params + kernelParams.queueObjOffset())[index];
+
+ if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
+ return false;
+ }
+ hsaKernel.setDynamicParallelFlag(true);
+ uint64_t vqVA = getVQVirtualAddress();
+ WriteAqlArgAt(const_cast(params), &vqVA, sizeof(vqVA), desc.offset_);
+ }
+ else if (desc.type_ == T_VOID) {
+ if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
+ const_address srcArgPtr = params + desc.offset_;
+ void* mem = allocKernArg(desc.size_, 128);
+ if (mem == nullptr) {
+ LogError("Out of memory");
+ return false;
+ }
+ memcpy(mem, srcArgPtr, desc.size_);
+ const auto it = hsaKernel.patch().find(desc.offset_);
+ WriteAqlArgAt(const_cast(params), &mem, sizeof(void*), it->second);
+ }
+ }
+ else if (desc.type_ == T_SAMPLER) {
+ uint32_t index = desc.info_.arrayIndex_;
+ const amd::Sampler* sampler = reinterpret_cast(params +
+ kernelParams.samplerObjOffset())[index];
+
+ hsa_ext_sampler_descriptor_t samplerDescriptor;
+ fillSampleDescriptor(samplerDescriptor, *sampler);
+
+ hsa_ext_sampler_t hsa_sampler;
+ hsa_status_t status =
+ hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
+
+ if (status != HSA_STATUS_SUCCESS) {
+ // Wait on a kernel if one is outstanding
+ releaseGpuMemoryFence();
+ // Release the sampler handles accumulated in samplerList_
+ // over one or more earlier kernel submissions
+ for (const auto& it: samplerList_) {
+ if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) {
+ LogWarning("Error destroying device sampler object!");
+ }
+ }
+
+ samplerList_.clear();
+ status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
+ if (status != HSA_STATUS_SUCCESS) {
+ LogError("Error creating device sampler object!");
+ return false;
+ }
+ }
+
+ uint64_t sampler_srd = hsa_sampler.handle;
+ WriteAqlArgAt(const_cast(params), &sampler_srd, sizeof(sampler_srd), desc.offset_);
+ samplerList_.push_back(hsa_sampler);
+ // TODO: destroy sampler.
+ }
}
if (hsaKernel.program()->hasGlobalStores()) {
@@ -1438,26 +1570,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
profilingEnd(vcmd);
}
-/*! \brief Writes to the buffer and increments the write pointer to the
- * buffer. Also, ensures that the argument is written to an
- * aligned memory as specified. Return the new write pointer.
- *
- * @param dst The write pointer to the buffer
- * @param src The source pointer
- * @param size The size in bytes to copy
- * @param alignment The alignment to follow while writing to the buffer
- */
-static inline address addArg(address dst, const void* src, size_t size, uint32_t alignment) {
- dst = amd::alignUp(dst, alignment);
- ::memcpy(dst, src, size);
- return dst + size;
-}
-
-static inline address addArg(address dst, const void* src, size_t size) {
- assert(size < UINT32_MAX);
- return addArg(dst, src, size, size);
-}
-
// Over rides the workgroup size fields in the packet with runtime/compiler set sizes
void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
amd::NDRangeContainer sizes, device::Kernel* devKernel,
@@ -1584,35 +1696,6 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
}
}
-static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor,
- const amd::Sampler& sampler) {
- samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
- ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
- : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
- samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
- ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
- : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
- switch (sampler.addressingMode()) {
- case CL_ADDRESS_CLAMP_TO_EDGE:
- samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
- break;
- case CL_ADDRESS_REPEAT:
- samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
- break;
- case CL_ADDRESS_CLAMP:
- samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
- break;
- case CL_ADDRESS_MIRRORED_REPEAT:
- samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
- break;
- case CL_ADDRESS_NONE:
- samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
- break;
- default:
- return;
- }
-}
-
bool VirtualGPU::createSchedulerParam()
{
if (nullptr != schedulerParam_) {
@@ -1797,12 +1880,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
const_address parameters, void* eventHandle) {
device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast(*devKernel);
-
- const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
- size_t ldsUsage = compilerLdsUsage;
+ size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
// Check memory dependency and SVM objects
- if (!processMemObjects(kernel, parameters)) {
+ if (!processMemObjects(kernel, parameters, ldsUsage)) {
LogError("Wrong memory objects!");
return false;
}
@@ -1868,58 +1949,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
return false;
}
- address argPtr = argBuffer;
- for (auto arg : gpuKernel.hsailArgs()) {
- const_address srcArgPtr = nullptr;
- if (arg->index_ != uint(-1)) {
- srcArgPtr = parameters + signature.at(arg->index_).offset_;
- }
-
- // Handle the hidden arguments first, as they do not have a
- // matching parameter in the OCL signature (not a valid arg->index_)
- switch (arg->type_) {
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
- size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0;
- assert(arg->size_ == sizeof(offset_x) && "check the sizes");
- argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
+ // Check if runtime has to setup hidden arguments
+ for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
+ const auto it = signature.at(i);
+ size_t offset;
+ switch (it.info_.oclObject_) {
+ case amd::KernelParameterDescriptor::HiddenNone:
+ break;
+ case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: {
+ offset = newOffset[0];
+ assert(it.size_ == sizeof(offset) && "check the sizes");
+ WriteAqlArgAt(const_cast(parameters), &offset, it.size_, it.offset_);
break;
}
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
- size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0;
- assert(arg->size_ == sizeof(offset_y) && "check the sizes");
- argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
+ case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: {
+ if (sizes.dimensions() >= 2) {
+ offset = newOffset[1];
+ assert(it.size_ == sizeof(offset) && "check the sizes");
+ WriteAqlArgAt(const_cast(parameters), &offset, it.size_, it.offset_);
+ }
break;
}
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
- size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0;
- assert(arg->size_ == sizeof(offset_z) && "check the sizes");
- argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
+ case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: {
+ if (sizes.dimensions() >= 3) {
+ offset = newOffset[2];
+ assert(it.size_ == sizeof(offset) && "check the sizes");
+ WriteAqlArgAt(const_cast(parameters), &offset, it.size_, it.offset_);
+ }
break;
}
- case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
+ case amd::KernelParameterDescriptor::HiddenPrintfBuffer: {
address bufferPtr = printfDbg()->dbgBuffer();
- assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
- argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
+ if (printfEnabled &&
+ // and printf buffer was allocated
+ (bufferPtr != nullptr)) {
+ assert(it.size_ == sizeof(bufferPtr) && "check the sizes");
+ WriteAqlArgAt(const_cast(parameters), &bufferPtr, it.size_, it.offset_);
+ }
break;
}
- case ROC_ARGTYPE_QUEUE: {
- uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
- const amd::DeviceQueue* queue = reinterpret_cast(parameters +
- kernelParams.samplerObjOffset())[index];
- if (queue == nullptr) {
- return false;
- }
-
- if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
- return false;
- }
- gpuKernel.setDynamicParallelFlag(true);
- uint64_t vqVA = getVQVirtualAddress();
- argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_);
- break;
- }
- case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: {
-
+ case amd::KernelParameterDescriptor::HiddenDefaultQueue: {
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
if (!createVirtualQueue(defQueue->size()) || !createSchedulerParam()) {
@@ -1927,156 +1996,28 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
gpuKernel.setDynamicParallelFlag(true);
uint64_t vqVA = getVQVirtualAddress();
- argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_);
+ WriteAqlArgAt(const_cast(parameters), &vqVA, it.size_, it.offset_);
break;
}
- case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: {
-
+ case amd::KernelParameterDescriptor::HiddenCompletionAction: {
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
AmdAqlWrap* wrap = reinterpret_cast(reinterpret_cast(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_DONE;
uint64_t spVA = reinterpret_cast(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
- argPtr = addArg(argPtr, &spVA, arg->size_, arg->alignment_);
+ WriteAqlArgAt(const_cast(parameters), &spVA, it.size_, it.offset_);
break;
}
- case ROC_ARGTYPE_HIDDEN_NONE: {
- void* zero = 0;
- assert(arg->size_ <= sizeof(zero) && "check the sizes");
- argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
- break;
- }
- case ROC_ARGTYPE_POINTER: {
- if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
- // Align the LDS on the alignment requirement of type pointed to
- ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
- argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
- if (sizeof(uint64_t) == arg->size_) {
- ldsUsage += *reinterpret_cast(srcArgPtr);
- } else {
- ldsUsage += *reinterpret_cast(srcArgPtr);
- }
- break;
- }
- assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL || arg->addrQual_ == ROC_ADDRESS_CONSTANT) &&
- "Unsupported address qualifier");
- argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
- uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
- amd::Memory* mem = memories[index];
- if (mem == nullptr) {
- break;
- }
-
- const bool readOnly =
-#if defined(WITH_LIGHTNING_COMPILER)
- signature.at(arg->index_).typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST ||
-#endif // defined(WITH_LIGHTNING_COMPILER)
- (mem->getMemFlags() & CL_MEM_READ_ONLY) != 0;
-
- if (!readOnly) {
- mem->signalWrite(&dev());
- }
- break;
- }
- case ROC_ARGTYPE_REFERENCE: {
- void* mem = allocKernArg(arg->size_, arg->alignment_);
- if (mem == nullptr) {
- LogError("Out of memory");
- return false;
- }
- memcpy(mem, srcArgPtr, arg->size_);
- argPtr = addArg(argPtr, &mem, sizeof(void*));
- break;
- }
- case ROC_ARGTYPE_VALUE:
- argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
- break;
- case ROC_ARGTYPE_IMAGE: {
- uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
- amd::Memory* mem = memories[index];
- Image* image = static_cast(mem->getDeviceMemory(dev()));
- if (image == nullptr) {
- LogError("Kernel image argument is not an image object");
- return false;
- }
-
- if (dev().settings().enableImageHandle_) {
- const uint64_t image_srd = image->getHsaImageObject().handle;
- assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
- argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
- } else {
- // Image arguments are of size 48 bytes and are aligned to 16 bytes
- argPtr = addArg(argPtr, (void*)image->getHsaImageObject().handle, HSA_IMAGE_OBJECT_SIZE,
- HSA_IMAGE_OBJECT_ALIGNMENT);
- }
-
- const bool readOnly =
-#if defined(WITH_LIGHTNING_COMPILER)
- signature.at(arg->index_).accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY ||
-#endif // defined(WITH_LIGHTNING_COMPILER)
- mem->getMemFlags() & CL_MEM_READ_ONLY;
-
- if (!readOnly) {
- mem->signalWrite(&dev());
- }
- break;
- }
- case ROC_ARGTYPE_SAMPLER: {
- uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
- const amd::Sampler* sampler = reinterpret_cast(parameters +
- kernelParams.samplerObjOffset())[index];
- if (sampler == nullptr) {
- LogError("Kernel sampler argument is not an sampler object");
- return false;
- }
-
- hsa_ext_sampler_descriptor_t samplerDescriptor;
- fillSampleDescriptor(samplerDescriptor, *sampler);
-
- hsa_ext_sampler_t hsa_sampler;
- hsa_status_t status =
- hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
- if (status != HSA_STATUS_SUCCESS) {
- // Wait on a kernel if one is outstanding
- releaseGpuMemoryFence();
- // Release the sampler handles allocated for the various
- // on one or more kernel submissions
- for (const auto& it: samplerList_) {
- if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) {
- LogWarning("Error destroying device sampler object!");
- }
- }
- samplerList_.clear();
-
- status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
- if (status != HSA_STATUS_SUCCESS) {
- LogError("Error creating device sampler object!");
- return false;
- }
- }
-
- if (dev().settings().enableImageHandle_) {
- uint64_t sampler_srd = hsa_sampler.handle;
- argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
- samplerList_.push_back(hsa_sampler);
- // TODO: destroy sampler.
- } else {
- argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
-
- memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
- argPtr += HSA_SAMPLER_OBJECT_SIZE;
- hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
- }
- break;
- }
- default:
- return false;
}
}
- // Check there is no arguments' buffer overflow
- assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
+ // Load all kernel arguments
+ WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0);
+ // Note: In a case of structs the size won't match,
+ // since HSAIL compiler expects a reference...
+ assert(gpuKernel.KernargSegmentByteSize() <= signature.paramsSize() &&
+ "A mismatch of sizes of arguments between compiler and runtime!");
// Check for group memory overflow
//! @todo Check should be in HSA - here we should have at most an assert
diff --git a/rocclr/runtime/device/rocm/rocvirtual.hpp b/rocclr/runtime/device/rocm/rocvirtual.hpp
index 520cc9f515..40758f8fd5 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -217,7 +217,8 @@ class VirtualGPU : public device::VirtualDevice {
//! Detects memory dependency for HSAIL kernels and uses appropriate AQL header
bool processMemObjects(const amd::Kernel& kernel, //!< AMD kernel object for execution
- const_address params //!< Pointer to the param's store
+ const_address params, //!< Pointer to the param's store
+ size_t& ldsAddress //!< LDS usage
);
// Retun the virtual gpu unique index
uint index() const { return index_; }
@@ -313,4 +314,34 @@ class VirtualGPU : public device::VirtualDevice {
};
};
+
+template
+inline void WriteAqlArgAt(
+ unsigned char* dst, //!< Base pointer of the destination argument buffer
+ const T* src, //!< The source pointer
+ uint size, //!< The size in bytes to copy
+ size_t offset //!< Byte offset from dst at which the data is written
+) {
+ memcpy(dst + offset, src, size);
+}
+
+template <>
+inline void WriteAqlArgAt(
+ unsigned char* dst, //!< Base pointer of the destination argument buffer
+ const uint32_t* src, //!< The source pointer
+ uint size, //!< Size in bytes; not referenced by this specialization
+ size_t offset //!< Byte offset from dst at which the value is stored
+) {
+ *(reinterpret_cast(dst + offset)) = *src; // assigns *src directly instead of calling memcpy
+}
+
+template <>
+inline void WriteAqlArgAt(
+ unsigned char* dst, //!< Base pointer of the destination argument buffer
+ const uint64_t* src, //!< The source pointer
+ uint size, //!< Size in bytes; not referenced by this specialization
+ size_t offset //!< Byte offset from dst at which the value is stored
+) {
+ *(reinterpret_cast(dst + offset)) = *src; // assigns *src directly instead of calling memcpy
+}
}