P4 to Git Change 1589476 by axie@axie-rocm-opencl on 2018/08/03 15:54:24

SWDEV-79445 - OCL generic changes and code clean-up
	- Optimize setup of kernel arguments.
	- Add HW ABI support in the abstraction layer
	- Remove arguments parsing loop from the kernel launch. Memory processing will be responsible for dependency tracking and  patching of arguments.

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/15400/

	Tests:
	1. ./run_conformance.py ./opencl_conformance_tests_reallyquick.csv CL_DEVICE_TYPE_GPU for openCL 1.2: OpenCL-GL sharing failed. This is not a regression.
	2. ./ocltst -m oclruntime.so -A oclruntime.exclude
	3. ./run_conformance.py opencl_conformance_tests_lightning.csv CL_DEVICE_TYPE_GPU : PASS
	4. teamcity test: http://ocltc.amd.com:8111/viewModification.html?modId=104598&personal=true&buildTypeId=&tab=vcsModificationBuilds&show_all_builds=true

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#34 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#60 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#17 edit
This commit is contained in:
foreman
2018-08-03 16:05:12 -04:00
rodzic 6d82075654
commit 9cd2db67f6
6 zmienionych plików z 356 dodań i 275 usunięć
+121 -13
Wyświetl plik
@@ -231,6 +231,37 @@ static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo)
return ROC_ADDRESS_ERROR;
}
//! Maps the ROC argument type stored in arg->type_ to the matching
//! amd::KernelParameterDescriptor object kind.
//!
//! Any type without an explicit mapping below yields HiddenNone.
inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
  using Desc = amd::KernelParameterDescriptor;

  // Start from the fallback and overwrite it only for the known types
  uint32_t oclObject = Desc::HiddenNone;

  switch (arg->type_) {
    // Hidden (runtime supplied) arguments
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
      oclObject = Desc::HiddenGlobalOffsetX;
      break;
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
      oclObject = Desc::HiddenGlobalOffsetY;
      break;
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
      oclObject = Desc::HiddenGlobalOffsetZ;
      break;
    case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER:
      oclObject = Desc::HiddenPrintfBuffer;
      break;
    case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
      oclObject = Desc::HiddenDefaultQueue;
      break;
    case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
      oclObject = Desc::HiddenCompletionAction;
      break;
    // Arguments visible in the OCL signature
    case ROC_ARGTYPE_POINTER:
      oclObject = Desc::MemoryObject;
      break;
    case ROC_ARGTYPE_IMAGE:
      oclObject = Desc::ImageObject;
      break;
    case ROC_ARGTYPE_REFERENCE:
      oclObject = Desc::ReferenceObject;
      break;
    case ROC_ARGTYPE_VALUE:
      oclObject = Desc::ValueObject;
      break;
    case ROC_ARGTYPE_SAMPLER:
      oclObject = Desc::SamplerObject;
      break;
    case ROC_ARGTYPE_QUEUE:
      oclObject = Desc::QueueObject;
      break;
    default:
      break;
  }

  return oclObject;
}
#if defined(WITH_LIGHTNING_COMPILER)
static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
aclArgDataType dataType;
@@ -514,6 +545,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argI
#if defined(WITH_COMPILER_LIB)
void HSAILKernel::initArguments(const aclArgData* aclArg) {
device::Kernel::parameters_t params;
device::Kernel::parameters_t hiddenParams;
size_t offsetStruct = KernargSegmentByteSize();
// Iterate through the arguments and insert into parameterList
for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) {
@@ -539,17 +572,27 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
arg->index_ = isHidden ? uint(-1) : params.size();
hsailArgList_.push_back(arg);
amd::KernelParameterDescriptor desc;
// Allocate the hidden arguments, but abstraction layer will skip them
if (isHidden) {
offset = amd::alignUp(offset, arg->alignment_);
desc.offset_ = offset;
desc.size_ = arg->size_;
offset += arg->size_;
desc.info_.oclObject_ = GetOclArgumentType(arg);
hiddenParams.push_back(desc);
continue;
}
amd::KernelParameterDescriptor desc;
desc.name_ = arg->name_.c_str();
desc.type_ = GetOclType(arg);
desc.addressQualifier_ = GetOclAddrQual(arg);
desc.accessQualifier_ = GetOclAccessQual(arg);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arg->typeName_.c_str();
desc.info_.oclObject_ = GetOclArgumentType(arg);
desc.info_.arrayIndex_ = arg->pointeeAlignment_;
// set image related flags
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
@@ -566,19 +609,48 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
// and CPU sends the parameters as they are allocated in memory
size_t size = desc.size_;
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
// Check if HSAIL expects data by reference and allocate it behind
if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
desc.offset_ = offsetStruct;
// Align the offset reference
offset = amd::alignUp(offset, sizeof(size_t));
patchReferences_.insert({desc.offset_, offset});
offsetStruct += size;
// Adjust the offset of arguments
offset += sizeof(size_t);
}
else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
// These objects have forced data size to uint64_t
offset = amd::alignUp(offset, sizeof(uint64_t));
desc.offset_ = offset;
offset += sizeof(uint64_t);
} else {
offset = amd::alignUp(offset, arg->alignment_);
desc.offset_ = offset;
offset += size;
}
// Update read only flag
desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
params.push_back(desc);
}
createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
// Save the number of OCL arguments
uint32_t numParams = params.size();
// Append the hidden arguments to the OCL arguments
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
}
#endif // defined(WITH_COMPILER_LIB)
#if defined(WITH_LIGHTNING_COMPILER)
void LightningKernel::initArguments(const KernelMD& kernelMD) {
device::Kernel::parameters_t params;
device::Kernel::parameters_t hiddenParams;
size_t offsetStruct = KernargSegmentByteSize();
size_t offset = 0;
@@ -607,19 +679,27 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
arg->index_ = isHidden ? uint(-1) : params.size();
hsailArgList_.push_back(arg);
if (isHidden) {
continue;
}
// Initialize Device kernel parameters
amd::KernelParameterDescriptor desc;
if (isHidden) {
offset = amd::alignUp(offset, arg->alignment_);
desc.offset_ = offset;
desc.size_ = arg->size_;
offset += arg->size_;
desc.info_.oclObject_ = GetOclArgumentType(arg);
hiddenParams.push_back(desc);
continue;
}
desc.name_ = lcArg.mName.c_str();
desc.type_ = GetOclType(arg);
desc.addressQualifier_ = GetOclAddrQual(arg);
desc.accessQualifier_ = GetOclAccessQual(arg);
desc.typeQualifier_ = GetOclTypeQual(lcArg);
desc.typeName_ = lcArg.mTypeName.c_str();
desc.info_.oclObject_ = GetOclArgumentType(arg);
desc.info_.arrayIndex_ = arg->pointeeAlignment_;
// set image related flags
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
@@ -629,6 +709,7 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
flags_.imageWrite_ = true;
}
}
desc.size_ = arg->size_;
// Make offset alignment to match CPU metadata, since
@@ -636,13 +717,40 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
// and CPU sends the parameters as they are allocated in memory
size_t size = desc.size_;
offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
// Check if HSAIL expects data by reference and allocate it behind
if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
desc.offset_ = offsetStruct;
// Align the offset reference
offset = amd::alignUp(offset, sizeof(size_t));
patchReferences_.insert({desc.offset_, offset});
offsetStruct += size;
// Adjust the offset of arguments
offset += sizeof(size_t);
}
else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
// These objects have forced data size to uint64_t
offset = amd::alignUp(offset, sizeof(uint64_t));
desc.offset_ = offset;
offset += sizeof(uint64_t);
} else {
offset = amd::alignUp(offset, arg->alignment_);
desc.offset_ = offset;
offset += size;
}
// Update read only flag
desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
params.push_back(desc);
}
createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
// Save the number of OCL arguments
uint32_t numParams = params.size();
// Append the hidden arguments to the OCL arguments
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
}
#endif // defined(WITH_LIGHTNING_COMPILER)
@@ -140,6 +140,8 @@ class Kernel : public device::Kernel {
//! Return TRUE if kernel writes images
bool imageWrite() const { return (flags_.imageWrite_) ? true : false; }
const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
protected:
union Flags {
struct {
@@ -162,6 +164,7 @@ class Kernel : public device::Kernel {
const uint32_t kernargSegmentAlignment_;
size_t kernelDirectiveOffset_;
std::vector<PrintfInfo> printf_;
std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
};
#if defined(WITH_COMPILER_LIB)
@@ -20,7 +20,6 @@ Settings::Settings() {
pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION;
enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
enableImageHandle_ = true;
maxWorkGroupSize_ = 1024;
preferredWorkGroupSize_ = 256;
@@ -22,7 +22,6 @@ class Settings : public device::Settings {
uint doublePrecision_ : 1; //!< Enables double precision support
uint pollCompletion_ : 1; //!< Enables polling in HSA
uint enableLocalMemory_ : 1; //!< Enable GPUVM memory
uint enableImageHandle_ : 1; //!< Use HSAIL image/sampler pointer
uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory
uint enablePartialDispatch_ : 1; //!< Enable support for Partial Dispatch
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
+200 -259
Wyświetl plik
@@ -185,8 +185,37 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
}
}
bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params) {
const Kernel& hsaKernel = static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev())));
//! Translates the state of an OpenCL sampler into an HSA sampler descriptor.
//!
//! Writes filter_mode, coordinate_mode and address_mode of the descriptor.
//! All three fields are always assigned, so a descriptor that was not
//! initialized by the caller never carries an indeterminate value.
//!
//! @param samplerDescriptor  HSA descriptor to fill
//! @param sampler            OpenCL sampler that provides the state
static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor,
                                 const amd::Sampler& sampler) {
  samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
      ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
      : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
  samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
      ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
      : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
  switch (sampler.addressingMode()) {
    case CL_ADDRESS_CLAMP_TO_EDGE:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
      break;
    case CL_ADDRESS_REPEAT:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
      break;
    case CL_ADDRESS_CLAMP:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
      break;
    case CL_ADDRESS_MIRRORED_REPEAT:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
      break;
    case CL_ADDRESS_NONE:
    default:
      // An unrecognized addressing mode used to return without touching
      // address_mode; treat it like CL_ADDRESS_NONE instead of leaving the
      // field unwritten.
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
      break;
  }
}
bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params, size_t& ldsAddress) {
Kernel& hsaKernel = const_cast<Kernel&>(static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev()))));
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
@@ -256,38 +285,141 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const Kernel::Argument* arg = hsaKernel.hsailArgAt(i);
Memory* gpuMem = nullptr;
bool readOnly = false;
amd::Memory* mem = nullptr;
// Find if current argument is a buffer
if ((desc.type_ == T_POINTER) && (arg->addrQual_ != ROC_ADDRESS_LOCAL)) {
uint32_t index = desc.info_.arrayIndex_;
mem = memories[index];
if (mem != nullptr) {
gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));
// Don't sync for internal objects,
// since they are not shared between devices
if (gpuMem->owner()->getVirtualDevice() == nullptr) {
// Synchronize data with other memory instances if necessary
gpuMem->syncCacheFromHost(*this);
if (desc.type_ == T_POINTER) {
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
// Align the LDS on the alignment requirement of type pointed to
ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
if (desc.size_ == 8) {
// Save the original LDS size
uint64_t ldsSize = *reinterpret_cast<const uint64_t*>(params + desc.offset_);
// Patch the LDS address in the original arguments with an LDS address(offset)
WriteAqlArgAt(const_cast<address>(params), &ldsAddress, desc.size_, desc.offset_);
// Add the original size
ldsAddress += ldsSize;
} else {
// Save the original LDS size
uint32_t ldsSize = *reinterpret_cast<const uint32_t*>(params + desc.offset_);
// Patch the LDS address in the original arguments with an LDS address(offset)
uint32_t ldsAddr = ldsAddress;
WriteAqlArgAt(const_cast<address>(params), &ldsAddr, desc.size_, desc.offset_);
// Add the original size
ldsAddress += ldsSize;
}
}
//! This condition is for SVM fine-grain
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
// Sync AQL packets
setAqlHeader(kDispatchPacketHeader);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
continue;
} else if (gpuMem != nullptr) {
readOnly |= (arg->access_ == ROC_ACCESS_TYPE_RO);
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, gpuMem, readOnly);
else {
uint32_t index = desc.info_.arrayIndex_;
mem = memories[index];
if (mem == nullptr) {
//! This condition is for SVM fine-grain
if (dev().isFineGrainedSystem(true)) {
// Sync AQL packets
setAqlHeader(kDispatchPacketHeader);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
}
}
else {
gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));
// Don't sync for internal objects,
// since they are not shared between devices
if (gpuMem->owner()->getVirtualDevice() == nullptr) {
// Synchronize data with other memory instances if necessary
gpuMem->syncCacheFromHost(*this);
}
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, gpuMem, (desc.info_.readOnly_ == 1));
assert((desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_GLOBAL ||
desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) &&
"Unsupported address qualifier");
const bool readOnly =
#if defined(WITH_LIGHTNING_COMPILER)
desc.typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST ||
#endif // defined(WITH_LIGHTNING_COMPILER)
(mem->getMemFlags() & CL_MEM_READ_ONLY) != 0;
if (!readOnly) {
mem->signalWrite(&dev());
}
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
Image* image = static_cast<Image*>(mem->getDeviceMemory(dev()));
const uint64_t image_srd = image->getHsaImageObject().handle;
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
WriteAqlArgAt(const_cast<address>(params), &image_srd, sizeof(image_srd), desc.offset_);
}
}
}
}
else if (desc.type_ == T_QUEUE) {
uint32_t index = desc.info_.arrayIndex_;
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
params + kernelParams.queueObjOffset())[index];
if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
return false;
}
hsaKernel.setDynamicParallelFlag(true);
uint64_t vqVA = getVQVirtualAddress();
WriteAqlArgAt(const_cast<address>(params), &vqVA, sizeof(vqVA), desc.offset_);
}
else if (desc.type_ == T_VOID) {
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
const_address srcArgPtr = params + desc.offset_;
void* mem = allocKernArg(desc.size_, 128);
if (mem == nullptr) {
LogError("Out of memory");
return false;
}
memcpy(mem, srcArgPtr, desc.size_);
const auto it = hsaKernel.patch().find(desc.offset_);
WriteAqlArgAt(const_cast<address>(params), &mem, sizeof(void*), it->second);
}
}
else if (desc.type_ == T_SAMPLER) {
uint32_t index = desc.info_.arrayIndex_;
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(params +
kernelParams.samplerObjOffset())[index];
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSampleDescriptor(samplerDescriptor, *sampler);
hsa_ext_sampler_t hsa_sampler;
hsa_status_t status =
hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
if (status != HSA_STATUS_SUCCESS) {
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
// Release the sampler handles allocated for the various
// on one or more kernel submissions
for (const auto& it: samplerList_) {
if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) {
LogWarning("Error destroying device sampler object!");
}
}
samplerList_.clear();
status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
if (status != HSA_STATUS_SUCCESS) {
LogError("Error creating device sampler object!");
return false;
}
}
uint64_t sampler_srd = hsa_sampler.handle;
WriteAqlArgAt(const_cast<address>(params), &sampler_srd, sizeof(sampler_srd), desc.offset_);
samplerList_.push_back(hsa_sampler);
// TODO: destroy sampler.
}
}
if (hsaKernel.program()->hasGlobalStores()) {
@@ -1438,26 +1570,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
profilingEnd(vcmd);
}
/*! \brief Copies one argument into the buffer at the next aligned position.
 *
 * The write pointer is first rounded up with amd::alignUp() to the requested
 * alignment, then \a size bytes are copied from \a src to that position.
 *
 * @param dst The current write pointer into the buffer
 * @param src The source pointer
 * @param size The number of bytes to copy
 * @param alignment The alignment applied to \a dst before the copy
 * @return The write pointer just past the copied bytes
 */
static inline address addArg(address dst, const void* src, size_t size, uint32_t alignment) {
  address const target = amd::alignUp(dst, alignment);
  ::memcpy(target, src, size);
  return target + size;
}
//! Convenience overload: the size of the argument is also used as its alignment.
//! Returns whatever the four-argument overload returns for that alignment.
static inline address addArg(address dst, const void* src, size_t size) {
  // The alignment parameter is 32 bits wide, so the size must fit into it
  assert(size < UINT32_MAX);
  const uint32_t naturalAlignment = static_cast<uint32_t>(size);
  return addArg(dst, src, size, naturalAlignment);
}
// Over rides the workgroup size fields in the packet with runtime/compiler set sizes
void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
amd::NDRangeContainer sizes, device::Kernel* devKernel,
@@ -1584,35 +1696,6 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
}
}
//! Translates the state of an OpenCL sampler into an HSA sampler descriptor.
//!
//! Writes filter_mode, coordinate_mode and address_mode of the descriptor.
//! All three fields are always assigned, so a descriptor that was not
//! initialized by the caller never carries an indeterminate value.
//!
//! @param samplerDescriptor  HSA descriptor to fill
//! @param sampler            OpenCL sampler that provides the state
static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor,
                                 const amd::Sampler& sampler) {
  samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
      ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
      : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
  samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
      ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
      : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
  switch (sampler.addressingMode()) {
    case CL_ADDRESS_CLAMP_TO_EDGE:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
      break;
    case CL_ADDRESS_REPEAT:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
      break;
    case CL_ADDRESS_CLAMP:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
      break;
    case CL_ADDRESS_MIRRORED_REPEAT:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
      break;
    case CL_ADDRESS_NONE:
    default:
      // An unrecognized addressing mode used to return without touching
      // address_mode; treat it like CL_ADDRESS_NONE instead of leaving the
      // field unwritten.
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
      break;
  }
}
bool VirtualGPU::createSchedulerParam()
{
if (nullptr != schedulerParam_) {
@@ -1797,12 +1880,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
const_address parameters, void* eventHandle) {
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
size_t ldsUsage = compilerLdsUsage;
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
// Check memory dependency and SVM objects
if (!processMemObjects(kernel, parameters)) {
if (!processMemObjects(kernel, parameters, ldsUsage)) {
LogError("Wrong memory objects!");
return false;
}
@@ -1868,58 +1949,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
return false;
}
address argPtr = argBuffer;
for (auto arg : gpuKernel.hsailArgs()) {
const_address srcArgPtr = nullptr;
if (arg->index_ != uint(-1)) {
srcArgPtr = parameters + signature.at(arg->index_).offset_;
}
// Handle the hidden arguments first, as they do not have a
// matching parameter in the OCL signature (not a valid arg->index_)
switch (arg->type_) {
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0;
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
// Check if runtime has to setup hidden arguments
for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
const auto it = signature.at(i);
size_t offset;
switch (it.info_.oclObject_) {
case amd::KernelParameterDescriptor::HiddenNone:
break;
case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: {
offset = newOffset[0];
assert(it.size_ == sizeof(offset) && "check the sizes");
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
break;
}
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0;
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: {
if (sizes.dimensions() >= 2) {
offset = newOffset[1];
assert(it.size_ == sizeof(offset) && "check the sizes");
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
}
break;
}
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0;
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: {
if (sizes.dimensions() >= 3) {
offset = newOffset[2];
assert(it.size_ == sizeof(offset) && "check the sizes");
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
}
break;
}
case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
case amd::KernelParameterDescriptor::HiddenPrintfBuffer: {
address bufferPtr = printfDbg()->dbgBuffer();
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
if (printfEnabled &&
// and printf buffer was allocated
(bufferPtr != nullptr)) {
assert(it.size_ == sizeof(bufferPtr) && "check the sizes");
WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
}
break;
}
case ROC_ARGTYPE_QUEUE: {
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(parameters +
kernelParams.samplerObjOffset())[index];
if (queue == nullptr) {
return false;
}
if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
return false;
}
gpuKernel.setDynamicParallelFlag(true);
uint64_t vqVA = getVQVirtualAddress();
argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: {
case amd::KernelParameterDescriptor::HiddenDefaultQueue: {
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
if (!createVirtualQueue(defQueue->size()) || !createSchedulerParam()) {
@@ -1927,156 +1996,28 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
gpuKernel.setDynamicParallelFlag(true);
uint64_t vqVA = getVQVirtualAddress();
argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_);
WriteAqlArgAt(const_cast<address>(parameters), &vqVA, it.size_, it.offset_);
break;
}
case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: {
case amd::KernelParameterDescriptor::HiddenCompletionAction: {
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_DONE;
uint64_t spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
argPtr = addArg(argPtr, &spVA, arg->size_, arg->alignment_);
WriteAqlArgAt(const_cast<address>(parameters), &spVA, it.size_, it.offset_);
break;
}
case ROC_ARGTYPE_HIDDEN_NONE: {
void* zero = 0;
assert(arg->size_ <= sizeof(zero) && "check the sizes");
argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_POINTER: {
if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
// Align the LDS on the alignment requirement of type pointed to
ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
if (sizeof(uint64_t) == arg->size_) {
ldsUsage += *reinterpret_cast<const uint64_t*>(srcArgPtr);
} else {
ldsUsage += *reinterpret_cast<const uint32_t*>(srcArgPtr);
}
break;
}
assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL || arg->addrQual_ == ROC_ADDRESS_CONSTANT) &&
"Unsupported address qualifier");
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
amd::Memory* mem = memories[index];
if (mem == nullptr) {
break;
}
const bool readOnly =
#if defined(WITH_LIGHTNING_COMPILER)
signature.at(arg->index_).typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST ||
#endif // defined(WITH_LIGHTNING_COMPILER)
(mem->getMemFlags() & CL_MEM_READ_ONLY) != 0;
if (!readOnly) {
mem->signalWrite(&dev());
}
break;
}
case ROC_ARGTYPE_REFERENCE: {
void* mem = allocKernArg(arg->size_, arg->alignment_);
if (mem == nullptr) {
LogError("Out of memory");
return false;
}
memcpy(mem, srcArgPtr, arg->size_);
argPtr = addArg(argPtr, &mem, sizeof(void*));
break;
}
case ROC_ARGTYPE_VALUE:
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
break;
case ROC_ARGTYPE_IMAGE: {
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
amd::Memory* mem = memories[index];
Image* image = static_cast<Image*>(mem->getDeviceMemory(dev()));
if (image == nullptr) {
LogError("Kernel image argument is not an image object");
return false;
}
if (dev().settings().enableImageHandle_) {
const uint64_t image_srd = image->getHsaImageObject().handle;
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
} else {
// Image arguments are of size 48 bytes and are aligned to 16 bytes
argPtr = addArg(argPtr, (void*)image->getHsaImageObject().handle, HSA_IMAGE_OBJECT_SIZE,
HSA_IMAGE_OBJECT_ALIGNMENT);
}
const bool readOnly =
#if defined(WITH_LIGHTNING_COMPILER)
signature.at(arg->index_).accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY ||
#endif // defined(WITH_LIGHTNING_COMPILER)
mem->getMemFlags() & CL_MEM_READ_ONLY;
if (!readOnly) {
mem->signalWrite(&dev());
}
break;
}
case ROC_ARGTYPE_SAMPLER: {
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
kernelParams.samplerObjOffset())[index];
if (sampler == nullptr) {
LogError("Kernel sampler argument is not an sampler object");
return false;
}
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSampleDescriptor(samplerDescriptor, *sampler);
hsa_ext_sampler_t hsa_sampler;
hsa_status_t status =
hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
if (status != HSA_STATUS_SUCCESS) {
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
// Release the sampler handles allocated for the various
// on one or more kernel submissions
for (const auto& it: samplerList_) {
if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) {
LogWarning("Error destroying device sampler object!");
}
}
samplerList_.clear();
status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
if (status != HSA_STATUS_SUCCESS) {
LogError("Error creating device sampler object!");
return false;
}
}
if (dev().settings().enableImageHandle_) {
uint64_t sampler_srd = hsa_sampler.handle;
argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
samplerList_.push_back(hsa_sampler);
// TODO: destroy sampler.
} else {
argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
argPtr += HSA_SAMPLER_OBJECT_SIZE;
hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
}
break;
}
default:
return false;
}
}
// Check there is no arguments' buffer overflow
assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
// Load all kernel arguments
WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0);
// Note: In a case of structs the size won't match,
// since HSAIL compiler expects a reference...
assert(gpuKernel.KernargSegmentByteSize() <= signature.paramsSize() &&
"A mismatch of sizes of arguments between compiler and runtime!");
// Check for group memory overflow
//! @todo Check should be in HSA - here we should have at most an assert
+32 -1
Wyświetl plik
@@ -217,7 +217,8 @@ class VirtualGPU : public device::VirtualDevice {
//! Detects memory dependency for HSAIL kernels and uses appropriate AQL header
bool processMemObjects(const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params //!< Pointer to the param's store
const_address params, //!< Pointer to the param's store
size_t& ldsAddress //!< LDS usage
);
// Return the virtual gpu unique index
uint index() const { return index_; }
@@ -313,4 +314,34 @@ class VirtualGPU : public device::VirtualDevice {
};
};
//! Writes an argument into the kernel argument buffer at a byte offset.
//!
//! Generic version: copies \a size bytes from \a src to \a dst + \a offset.
template <typename T>
inline void WriteAqlArgAt(
    unsigned char* dst,  //!< The base pointer of the destination buffer
    const T* src,        //!< The source pointer
    uint size,           //!< The size in bytes to copy
    size_t offset        //!< The byte offset in the buffer where the data is written
    ) {
  memcpy(dst + offset, src, size);
}

//! 32-bit version: always writes exactly sizeof(uint32_t) bytes; \a size is ignored.
//! A fixed-size memcpy is used instead of a store through a casted pointer,
//! so the write is well defined for any offset, including unaligned ones.
template <>
inline void WriteAqlArgAt(
    unsigned char* dst,   //!< The base pointer of the destination buffer
    const uint32_t* src,  //!< The source pointer
    uint size,            //!< Unused, the written size is sizeof(uint32_t)
    size_t offset         //!< The byte offset in the buffer where the data is written
    ) {
  static_cast<void>(size);
  memcpy(dst + offset, src, sizeof(uint32_t));
}

//! 64-bit version: always writes exactly sizeof(uint64_t) bytes; \a size is ignored.
//! A fixed-size memcpy is used instead of a store through a casted pointer,
//! so the write is well defined for any offset, including unaligned ones.
template <>
inline void WriteAqlArgAt(
    unsigned char* dst,   //!< The base pointer of the destination buffer
    const uint64_t* src,  //!< The source pointer
    uint size,            //!< Unused, the written size is sizeof(uint64_t)
    size_t offset         //!< The byte offset in the buffer where the data is written
    ) {
  static_cast<void>(size);
  memcpy(dst + offset, src, sizeof(uint64_t));
}
}