P4 to Git Change 1589476 by axie@axie-rocm-opencl on 2018/08/03 15:54:24
SWDEV-79445 - OCL generic changes and code clean-up - Optimize setup of kernel arguments. - Add HW ABI support in the abstraction layer - Remove arguments parsing loop from the kernel launch. Memory processing will be responsible for dependency tracking and patching of arguments. ReviewBoardURL = http://ocltc.amd.com/reviews/r/15400/ Tests: 1. ./run_conformance.py ./opencl_conformance_tests_reallyquick.csv CL_DEVICE_TYPE_GPU for openCL 1.2: OpenCL-GL sharing failed. This is not a regression. 2. ./ocltst -m oclruntime.so -A oclruntime.exclude 3. ./run_conformance.py opencl_conformance_tests_lightning.csv CL_DEVICE_TYPE_GPU : PASS 4. teamcity test: http://ocltc.amd.com:8111/viewModification.html?modId=104598&personal=true&buildTypeId=&tab=vcsModificationBuilds&show_all_builds=true Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#34 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#60 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#17 edit
This commit is contained in:
@@ -231,6 +231,37 @@ static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo)
|
||||
return ROC_ADDRESS_ERROR;
|
||||
}
|
||||
|
||||
//! Maps the ROC argument type stored in \p arg (arg->type_) to the matching
//! amd::KernelParameterDescriptor object kind.
//! Any ROC type without an explicit mapping below yields HiddenNone.
inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
  // Start from the fallback value and overwrite it on a recognized type
  uint32_t oclObject = amd::KernelParameterDescriptor::HiddenNone;

  switch (arg->type_) {
    // Hidden (implicit) arguments
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
      oclObject = amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
      break;
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
      oclObject = amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
      break;
    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
      oclObject = amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
      break;
    case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER:
      oclObject = amd::KernelParameterDescriptor::HiddenPrintfBuffer;
      break;
    case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
      oclObject = amd::KernelParameterDescriptor::HiddenDefaultQueue;
      break;
    case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
      oclObject = amd::KernelParameterDescriptor::HiddenCompletionAction;
      break;

    // Explicit kernel arguments
    case ROC_ARGTYPE_POINTER:
      oclObject = amd::KernelParameterDescriptor::MemoryObject;
      break;
    case ROC_ARGTYPE_IMAGE:
      oclObject = amd::KernelParameterDescriptor::ImageObject;
      break;
    case ROC_ARGTYPE_REFERENCE:
      oclObject = amd::KernelParameterDescriptor::ReferenceObject;
      break;
    case ROC_ARGTYPE_VALUE:
      oclObject = amd::KernelParameterDescriptor::ValueObject;
      break;
    case ROC_ARGTYPE_SAMPLER:
      oclObject = amd::KernelParameterDescriptor::SamplerObject;
      break;
    case ROC_ARGTYPE_QUEUE:
      oclObject = amd::KernelParameterDescriptor::QueueObject;
      break;

    default:
      // Keep the HiddenNone fallback
      break;
  }

  return oclObject;
}
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
|
||||
aclArgDataType dataType;
|
||||
@@ -514,6 +545,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argI
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
void HSAILKernel::initArguments(const aclArgData* aclArg) {
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
size_t offsetStruct = KernargSegmentByteSize();
|
||||
|
||||
// Iterate through the arguments and insert into parameterList
|
||||
for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) {
|
||||
@@ -539,17 +572,27 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
|
||||
arg->index_ = isHidden ? uint(-1) : params.size();
|
||||
hsailArgList_.push_back(arg);
|
||||
|
||||
amd::KernelParameterDescriptor desc;
|
||||
|
||||
// Allocate the hidden arguments, but abstraction layer will skip them
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, arg->alignment_);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = arg->size_;
|
||||
offset += arg->size_;
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
amd::KernelParameterDescriptor desc;
|
||||
desc.name_ = arg->name_.c_str();
|
||||
desc.type_ = GetOclType(arg);
|
||||
desc.addressQualifier_ = GetOclAddrQual(arg);
|
||||
desc.accessQualifier_ = GetOclAccessQual(arg);
|
||||
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
||||
desc.typeName_ = arg->typeName_.c_str();
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
desc.info_.arrayIndex_ = arg->pointeeAlignment_;
|
||||
|
||||
// set image related flags
|
||||
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
|
||||
@@ -566,19 +609,48 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
|
||||
// and CPU sends the parameters as they are allocated in memory
|
||||
size_t size = desc.size_;
|
||||
|
||||
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
||||
desc.offset_ = offset;
|
||||
offset += amd::alignUp(size, sizeof(uint32_t));
|
||||
// Check if HSAIL expects data by reference and allocate it behind
|
||||
if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
|
||||
desc.offset_ = offsetStruct;
|
||||
// Align the offset reference
|
||||
offset = amd::alignUp(offset, sizeof(size_t));
|
||||
patchReferences_.insert({desc.offset_, offset});
|
||||
offsetStruct += size;
|
||||
// Adjust the offset of arguments
|
||||
offset += sizeof(size_t);
|
||||
}
|
||||
else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
// These objects have forced data size to uint64_t
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
} else {
|
||||
offset = amd::alignUp(offset, arg->alignment_);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
|
||||
|
||||
params.push_back(desc);
|
||||
}
|
||||
createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
|
||||
|
||||
// Save the number of OCL arguments
|
||||
uint32_t numParams = params.size();
|
||||
// Append the hidden arguments to the OCL arguments
|
||||
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
|
||||
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
#endif // defined(WITH_COMPILER_LIB)
|
||||
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
void LightningKernel::initArguments(const KernelMD& kernelMD) {
|
||||
device::Kernel::parameters_t params;
|
||||
device::Kernel::parameters_t hiddenParams;
|
||||
size_t offsetStruct = KernargSegmentByteSize();
|
||||
|
||||
size_t offset = 0;
|
||||
|
||||
@@ -607,19 +679,27 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
|
||||
arg->index_ = isHidden ? uint(-1) : params.size();
|
||||
hsailArgList_.push_back(arg);
|
||||
|
||||
if (isHidden) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Initialize Device kernel parameters
|
||||
amd::KernelParameterDescriptor desc;
|
||||
|
||||
if (isHidden) {
|
||||
offset = amd::alignUp(offset, arg->alignment_);
|
||||
desc.offset_ = offset;
|
||||
desc.size_ = arg->size_;
|
||||
offset += arg->size_;
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
hiddenParams.push_back(desc);
|
||||
continue;
|
||||
}
|
||||
|
||||
desc.name_ = lcArg.mName.c_str();
|
||||
desc.type_ = GetOclType(arg);
|
||||
desc.addressQualifier_ = GetOclAddrQual(arg);
|
||||
desc.accessQualifier_ = GetOclAccessQual(arg);
|
||||
desc.typeQualifier_ = GetOclTypeQual(lcArg);
|
||||
desc.typeName_ = lcArg.mTypeName.c_str();
|
||||
desc.info_.oclObject_ = GetOclArgumentType(arg);
|
||||
desc.info_.arrayIndex_ = arg->pointeeAlignment_;
|
||||
|
||||
// set image related flags
|
||||
if (arg->type_ == ROC_ARGTYPE_IMAGE) {
|
||||
@@ -629,6 +709,7 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
|
||||
flags_.imageWrite_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
desc.size_ = arg->size_;
|
||||
|
||||
// Make offset alignment to match CPU metadata, since
|
||||
@@ -636,13 +717,40 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
|
||||
// and CPU sends the parameters as they are allocated in memory
|
||||
size_t size = desc.size_;
|
||||
|
||||
offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
|
||||
desc.offset_ = offset;
|
||||
offset += amd::alignUp(size, sizeof(uint32_t));
|
||||
// Check if HSAIL expects data by reference and allocate it behind
|
||||
if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
|
||||
desc.offset_ = offsetStruct;
|
||||
// Align the offset reference
|
||||
offset = amd::alignUp(offset, sizeof(size_t));
|
||||
patchReferences_.insert({desc.offset_, offset});
|
||||
offsetStruct += size;
|
||||
// Adjust the offset of arguments
|
||||
offset += sizeof(size_t);
|
||||
}
|
||||
else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
|
||||
(desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
|
||||
// These objects have forced data size to uint64_t
|
||||
offset = amd::alignUp(offset, sizeof(uint64_t));
|
||||
desc.offset_ = offset;
|
||||
offset += sizeof(uint64_t);
|
||||
} else {
|
||||
offset = amd::alignUp(offset, arg->alignment_);
|
||||
desc.offset_ = offset;
|
||||
offset += size;
|
||||
}
|
||||
|
||||
// Update read only flag
|
||||
desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
|
||||
|
||||
params.push_back(desc);
|
||||
}
|
||||
createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
|
||||
|
||||
// Save the number of OCL arguments
|
||||
uint32_t numParams = params.size();
|
||||
// Append the hidden arguments to the OCL arguments
|
||||
params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
|
||||
createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
|
||||
}
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
|
||||
|
||||
@@ -140,6 +140,8 @@ class Kernel : public device::Kernel {
|
||||
//! Return TRUE if kernel writes images
|
||||
bool imageWrite() const { return (flags_.imageWrite_) ? true : false; }
|
||||
|
||||
const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
|
||||
|
||||
protected:
|
||||
union Flags {
|
||||
struct {
|
||||
@@ -162,6 +164,7 @@ class Kernel : public device::Kernel {
|
||||
const uint32_t kernargSegmentAlignment_;
|
||||
size_t kernelDirectiveOffset_;
|
||||
std::vector<PrintfInfo> printf_;
|
||||
std::unordered_map<size_t, size_t> patchReferences_; //!< Patch table for references
|
||||
};
|
||||
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
|
||||
@@ -20,7 +20,6 @@ Settings::Settings() {
|
||||
pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION;
|
||||
|
||||
enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
|
||||
enableImageHandle_ = true;
|
||||
|
||||
maxWorkGroupSize_ = 1024;
|
||||
preferredWorkGroupSize_ = 256;
|
||||
|
||||
@@ -22,7 +22,6 @@ class Settings : public device::Settings {
|
||||
uint doublePrecision_ : 1; //!< Enables double precision support
|
||||
uint pollCompletion_ : 1; //!< Enables polling in HSA
|
||||
uint enableLocalMemory_ : 1; //!< Enable GPUVM memory
|
||||
uint enableImageHandle_ : 1; //!< Use HSAIL image/sampler pointer
|
||||
uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory
|
||||
uint enablePartialDispatch_ : 1; //!< Enable support for Partial Dispatch
|
||||
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
|
||||
|
||||
@@ -185,8 +185,37 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
|
||||
}
|
||||
}
|
||||
|
||||
bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params) {
|
||||
const Kernel& hsaKernel = static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev())));
|
||||
/*! \brief Fills an HSA sampler descriptor from the state of an OpenCL sampler.
 *
 * Writes the filter_mode, coordinate_mode and address_mode fields of the descriptor.
 *
 * @param samplerDescriptor The HSA descriptor to fill
 * @param sampler The OpenCL sampler the filter mode, coordinate normalization
 *                and addressing mode are read from
 */
static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor,
                                 const amd::Sampler& sampler) {
  samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
      ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
      : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
  samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
      ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
      : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
  switch (sampler.addressingMode()) {
    case CL_ADDRESS_CLAMP_TO_EDGE:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
      break;
    case CL_ADDRESS_REPEAT:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
      break;
    case CL_ADDRESS_CLAMP:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
      break;
    case CL_ADDRESS_MIRRORED_REPEAT:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
      break;
    case CL_ADDRESS_NONE:
    default:
      // Callers pass a descriptor that was declared without an initializer,
      // so an unrecognized addressing mode must still produce a defined
      // address_mode value instead of leaving the field unwritten.
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
      break;
  }
}
|
||||
|
||||
bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params, size_t& ldsAddress) {
|
||||
Kernel& hsaKernel = const_cast<Kernel&>(static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev()))));
|
||||
const amd::KernelSignature& signature = kernel.signature();
|
||||
const amd::KernelParameters& kernelParams = kernel.parameters();
|
||||
|
||||
@@ -256,38 +285,141 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
|
||||
// Check all parameters for the current kernel
|
||||
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
||||
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
||||
const Kernel::Argument* arg = hsaKernel.hsailArgAt(i);
|
||||
Memory* gpuMem = nullptr;
|
||||
bool readOnly = false;
|
||||
amd::Memory* mem = nullptr;
|
||||
|
||||
// Find if current argument is a buffer
|
||||
if ((desc.type_ == T_POINTER) && (arg->addrQual_ != ROC_ADDRESS_LOCAL)) {
|
||||
uint32_t index = desc.info_.arrayIndex_;
|
||||
mem = memories[index];
|
||||
if (mem != nullptr) {
|
||||
gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));
|
||||
// Don't sync for internal objects,
|
||||
// since they are not shared between devices
|
||||
if (gpuMem->owner()->getVirtualDevice() == nullptr) {
|
||||
// Synchronize data with other memory instances if necessary
|
||||
gpuMem->syncCacheFromHost(*this);
|
||||
if (desc.type_ == T_POINTER) {
|
||||
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
||||
// Align the LDS on the alignment requirement of type pointed to
|
||||
ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
|
||||
if (desc.size_ == 8) {
|
||||
// Save the original LDS size
|
||||
uint64_t ldsSize = *reinterpret_cast<const uint64_t*>(params + desc.offset_);
|
||||
// Patch the LDS address in the original arguments with an LDS address(offset)
|
||||
WriteAqlArgAt(const_cast<address>(params), &ldsAddress, desc.size_, desc.offset_);
|
||||
// Add the original size
|
||||
ldsAddress += ldsSize;
|
||||
} else {
|
||||
// Save the original LDS size
|
||||
uint32_t ldsSize = *reinterpret_cast<const uint32_t*>(params + desc.offset_);
|
||||
// Patch the LDS address in the original arguments with an LDS address(offset)
|
||||
uint32_t ldsAddr = ldsAddress;
|
||||
WriteAqlArgAt(const_cast<address>(params), &ldsAddr, desc.size_, desc.offset_);
|
||||
// Add the original size
|
||||
ldsAddress += ldsSize;
|
||||
}
|
||||
}
|
||||
//! This condition is for SVM fine-grain
|
||||
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
|
||||
// Sync AQL packets
|
||||
setAqlHeader(kDispatchPacketHeader);
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
memoryDependency().clear(!All);
|
||||
continue;
|
||||
} else if (gpuMem != nullptr) {
|
||||
readOnly |= (arg->access_ == ROC_ACCESS_TYPE_RO);
|
||||
// Validate memory for a dependency in the queue
|
||||
memoryDependency().validate(*this, gpuMem, readOnly);
|
||||
else {
|
||||
uint32_t index = desc.info_.arrayIndex_;
|
||||
mem = memories[index];
|
||||
if (mem == nullptr) {
|
||||
//! This condition is for SVM fine-grain
|
||||
if (dev().isFineGrainedSystem(true)) {
|
||||
// Sync AQL packets
|
||||
setAqlHeader(kDispatchPacketHeader);
|
||||
// Clear memory dependency state
|
||||
const static bool All = true;
|
||||
memoryDependency().clear(!All);
|
||||
}
|
||||
}
|
||||
else {
|
||||
gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));
|
||||
// Don't sync for internal objects,
|
||||
// since they are not shared between devices
|
||||
if (gpuMem->owner()->getVirtualDevice() == nullptr) {
|
||||
// Synchronize data with other memory instances if necessary
|
||||
gpuMem->syncCacheFromHost(*this);
|
||||
}
|
||||
|
||||
// Validate memory for a dependency in the queue
|
||||
memoryDependency().validate(*this, gpuMem, (desc.info_.readOnly_ == 1));
|
||||
|
||||
assert((desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_GLOBAL ||
|
||||
desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) &&
|
||||
"Unsupported address qualifier");
|
||||
|
||||
const bool readOnly =
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
desc.typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST ||
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
(mem->getMemFlags() & CL_MEM_READ_ONLY) != 0;
|
||||
|
||||
if (!readOnly) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
|
||||
Image* image = static_cast<Image*>(mem->getDeviceMemory(dev()));
|
||||
|
||||
const uint64_t image_srd = image->getHsaImageObject().handle;
|
||||
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
|
||||
WriteAqlArgAt(const_cast<address>(params), &image_srd, sizeof(image_srd), desc.offset_);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (desc.type_ == T_QUEUE) {
|
||||
uint32_t index = desc.info_.arrayIndex_;
|
||||
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
|
||||
params + kernelParams.queueObjOffset())[index];
|
||||
|
||||
if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
|
||||
return false;
|
||||
}
|
||||
hsaKernel.setDynamicParallelFlag(true);
|
||||
uint64_t vqVA = getVQVirtualAddress();
|
||||
WriteAqlArgAt(const_cast<address>(params), &vqVA, sizeof(vqVA), desc.offset_);
|
||||
}
|
||||
else if (desc.type_ == T_VOID) {
|
||||
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
|
||||
const_address srcArgPtr = params + desc.offset_;
|
||||
void* mem = allocKernArg(desc.size_, 128);
|
||||
if (mem == nullptr) {
|
||||
LogError("Out of memory");
|
||||
return false;
|
||||
}
|
||||
memcpy(mem, srcArgPtr, desc.size_);
|
||||
const auto it = hsaKernel.patch().find(desc.offset_);
|
||||
WriteAqlArgAt(const_cast<address>(params), &mem, sizeof(void*), it->second);
|
||||
}
|
||||
}
|
||||
else if (desc.type_ == T_SAMPLER) {
|
||||
uint32_t index = desc.info_.arrayIndex_;
|
||||
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(params +
|
||||
kernelParams.samplerObjOffset())[index];
|
||||
|
||||
hsa_ext_sampler_descriptor_t samplerDescriptor;
|
||||
fillSampleDescriptor(samplerDescriptor, *sampler);
|
||||
|
||||
hsa_ext_sampler_t hsa_sampler;
|
||||
hsa_status_t status =
|
||||
hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
|
||||
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
// Release the sampler handles allocated for the various
|
||||
// on one or more kernel submissions
|
||||
for (const auto& it: samplerList_) {
|
||||
if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) {
|
||||
LogWarning("Error destroying device sampler object!");
|
||||
}
|
||||
}
|
||||
|
||||
samplerList_.clear();
|
||||
status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogError("Error creating device sampler object!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t sampler_srd = hsa_sampler.handle;
|
||||
WriteAqlArgAt(const_cast<address>(params), &sampler_srd, sizeof(sampler_srd), desc.offset_);
|
||||
samplerList_.push_back(hsa_sampler);
|
||||
// TODO: destroy sampler.
|
||||
}
|
||||
}
|
||||
|
||||
if (hsaKernel.program()->hasGlobalStores()) {
|
||||
@@ -1438,26 +1570,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
|
||||
/*! \brief Writes to the buffer and increments the write pointer to the
|
||||
* buffer. Also, ensures that the argument is written to an
|
||||
* aligned memory as specified. Return the new write pointer.
|
||||
*
|
||||
* @param dst The write pointer to the buffer
|
||||
* @param src The source pointer
|
||||
* @param size The size in bytes to copy
|
||||
* @param alignment The alignment to follow while writing to the buffer
|
||||
*/
|
||||
static inline address addArg(address dst, const void* src, size_t size, uint32_t alignment) {
|
||||
dst = amd::alignUp(dst, alignment);
|
||||
::memcpy(dst, src, size);
|
||||
return dst + size;
|
||||
}
|
||||
|
||||
//! Convenience overload of addArg() that uses the argument's own size as its
//! alignment. The size must be smaller than UINT32_MAX (checked by assert).
static inline address addArg(address dst, const void* src, size_t size) {
  assert(size < UINT32_MAX);
  // The alignment parameter is 32 bit wide; the assert above guards the narrowing
  const uint32_t selfAlignment = static_cast<uint32_t>(size);
  return addArg(dst, src, size, selfAlignment);
}
|
||||
|
||||
// Overrides the workgroup size fields in the packet with runtime/compiler-set sizes
|
||||
void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
|
||||
amd::NDRangeContainer sizes, device::Kernel* devKernel,
|
||||
@@ -1584,35 +1696,6 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief Fills an HSA sampler descriptor from the state of an OpenCL sampler.
 *
 * Writes the filter_mode, coordinate_mode and address_mode fields of the descriptor.
 *
 * @param samplerDescriptor The HSA descriptor to fill
 * @param sampler The OpenCL sampler the filter mode, coordinate normalization
 *                and addressing mode are read from
 */
static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor,
                                 const amd::Sampler& sampler) {
  samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
      ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
      : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
  samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
      ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
      : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
  switch (sampler.addressingMode()) {
    case CL_ADDRESS_CLAMP_TO_EDGE:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
      break;
    case CL_ADDRESS_REPEAT:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
      break;
    case CL_ADDRESS_CLAMP:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
      break;
    case CL_ADDRESS_MIRRORED_REPEAT:
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
      break;
    case CL_ADDRESS_NONE:
    default:
      // Callers pass a descriptor that was declared without an initializer,
      // so an unrecognized addressing mode must still produce a defined
      // address_mode value instead of leaving the field unwritten.
      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
      break;
  }
}
|
||||
|
||||
bool VirtualGPU::createSchedulerParam()
|
||||
{
|
||||
if (nullptr != schedulerParam_) {
|
||||
@@ -1797,12 +1880,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
const_address parameters, void* eventHandle) {
|
||||
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
|
||||
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
|
||||
|
||||
const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
|
||||
size_t ldsUsage = compilerLdsUsage;
|
||||
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
|
||||
|
||||
// Check memory dependency and SVM objects
|
||||
if (!processMemObjects(kernel, parameters)) {
|
||||
if (!processMemObjects(kernel, parameters, ldsUsage)) {
|
||||
LogError("Wrong memory objects!");
|
||||
return false;
|
||||
}
|
||||
@@ -1868,58 +1949,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
return false;
|
||||
}
|
||||
|
||||
address argPtr = argBuffer;
|
||||
for (auto arg : gpuKernel.hsailArgs()) {
|
||||
const_address srcArgPtr = nullptr;
|
||||
if (arg->index_ != uint(-1)) {
|
||||
srcArgPtr = parameters + signature.at(arg->index_).offset_;
|
||||
}
|
||||
|
||||
// Handle the hidden arguments first, as they do not have a
|
||||
// matching parameter in the OCL signature (not a valid arg->index_)
|
||||
switch (arg->type_) {
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
|
||||
size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0;
|
||||
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
|
||||
// Check if runtime has to setup hidden arguments
|
||||
for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
|
||||
const auto it = signature.at(i);
|
||||
size_t offset;
|
||||
switch (it.info_.oclObject_) {
|
||||
case amd::KernelParameterDescriptor::HiddenNone:
|
||||
break;
|
||||
case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: {
|
||||
offset = newOffset[0];
|
||||
assert(it.size_ == sizeof(offset) && "check the sizes");
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
|
||||
size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0;
|
||||
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
|
||||
case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: {
|
||||
if (sizes.dimensions() >= 2) {
|
||||
offset = newOffset[1];
|
||||
assert(it.size_ == sizeof(offset) && "check the sizes");
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
|
||||
size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0;
|
||||
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
|
||||
case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: {
|
||||
if (sizes.dimensions() >= 3) {
|
||||
offset = newOffset[2];
|
||||
assert(it.size_ == sizeof(offset) && "check the sizes");
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
|
||||
case amd::KernelParameterDescriptor::HiddenPrintfBuffer: {
|
||||
address bufferPtr = printfDbg()->dbgBuffer();
|
||||
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
|
||||
if (printfEnabled &&
|
||||
// and printf buffer was allocated
|
||||
(bufferPtr != nullptr)) {
|
||||
assert(it.size_ == sizeof(bufferPtr) && "check the sizes");
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_QUEUE: {
|
||||
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
|
||||
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(parameters +
|
||||
kernelParams.samplerObjOffset())[index];
|
||||
if (queue == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
|
||||
return false;
|
||||
}
|
||||
gpuKernel.setDynamicParallelFlag(true);
|
||||
uint64_t vqVA = getVQVirtualAddress();
|
||||
argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: {
|
||||
|
||||
case amd::KernelParameterDescriptor::HiddenDefaultQueue: {
|
||||
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
|
||||
|
||||
if (!createVirtualQueue(defQueue->size()) || !createSchedulerParam()) {
|
||||
@@ -1927,156 +1996,28 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
}
|
||||
gpuKernel.setDynamicParallelFlag(true);
|
||||
uint64_t vqVA = getVQVirtualAddress();
|
||||
argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_);
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &vqVA, it.size_, it.offset_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: {
|
||||
|
||||
case amd::KernelParameterDescriptor::HiddenCompletionAction: {
|
||||
Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
|
||||
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
|
||||
memset(wrap, 0, sizeof(AmdAqlWrap));
|
||||
wrap->state = AQL_WRAP_DONE;
|
||||
|
||||
uint64_t spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
|
||||
argPtr = addArg(argPtr, &spVA, arg->size_, arg->alignment_);
|
||||
WriteAqlArgAt(const_cast<address>(parameters), &spVA, it.size_, it.offset_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_NONE: {
|
||||
void* zero = 0;
|
||||
assert(arg->size_ <= sizeof(zero) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_POINTER: {
|
||||
if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
|
||||
// Align the LDS on the alignment requirement of type pointed to
|
||||
ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
|
||||
argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
|
||||
if (sizeof(uint64_t) == arg->size_) {
|
||||
ldsUsage += *reinterpret_cast<const uint64_t*>(srcArgPtr);
|
||||
} else {
|
||||
ldsUsage += *reinterpret_cast<const uint32_t*>(srcArgPtr);
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL || arg->addrQual_ == ROC_ADDRESS_CONSTANT) &&
|
||||
"Unsupported address qualifier");
|
||||
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
|
||||
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
|
||||
amd::Memory* mem = memories[index];
|
||||
if (mem == nullptr) {
|
||||
break;
|
||||
}
|
||||
|
||||
const bool readOnly =
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
signature.at(arg->index_).typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST ||
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
(mem->getMemFlags() & CL_MEM_READ_ONLY) != 0;
|
||||
|
||||
if (!readOnly) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_REFERENCE: {
|
||||
void* mem = allocKernArg(arg->size_, arg->alignment_);
|
||||
if (mem == nullptr) {
|
||||
LogError("Out of memory");
|
||||
return false;
|
||||
}
|
||||
memcpy(mem, srcArgPtr, arg->size_);
|
||||
argPtr = addArg(argPtr, &mem, sizeof(void*));
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_VALUE:
|
||||
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
case ROC_ARGTYPE_IMAGE: {
|
||||
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
|
||||
amd::Memory* mem = memories[index];
|
||||
Image* image = static_cast<Image*>(mem->getDeviceMemory(dev()));
|
||||
if (image == nullptr) {
|
||||
LogError("Kernel image argument is not an image object");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dev().settings().enableImageHandle_) {
|
||||
const uint64_t image_srd = image->getHsaImageObject().handle;
|
||||
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
|
||||
argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
|
||||
} else {
|
||||
// Image arguments are of size 48 bytes and are aligned to 16 bytes
|
||||
argPtr = addArg(argPtr, (void*)image->getHsaImageObject().handle, HSA_IMAGE_OBJECT_SIZE,
|
||||
HSA_IMAGE_OBJECT_ALIGNMENT);
|
||||
}
|
||||
|
||||
const bool readOnly =
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
signature.at(arg->index_).accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY ||
|
||||
#endif // defined(WITH_LIGHTNING_COMPILER)
|
||||
mem->getMemFlags() & CL_MEM_READ_ONLY;
|
||||
|
||||
if (!readOnly) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_SAMPLER: {
|
||||
uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
|
||||
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
|
||||
kernelParams.samplerObjOffset())[index];
|
||||
if (sampler == nullptr) {
|
||||
LogError("Kernel sampler argument is not an sampler object");
|
||||
return false;
|
||||
}
|
||||
|
||||
hsa_ext_sampler_descriptor_t samplerDescriptor;
|
||||
fillSampleDescriptor(samplerDescriptor, *sampler);
|
||||
|
||||
hsa_ext_sampler_t hsa_sampler;
|
||||
hsa_status_t status =
|
||||
hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
// Release the sampler handles allocated for the various
|
||||
// on one or more kernel submissions
|
||||
for (const auto& it: samplerList_) {
|
||||
if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) {
|
||||
LogWarning("Error destroying device sampler object!");
|
||||
}
|
||||
}
|
||||
samplerList_.clear();
|
||||
|
||||
status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogError("Error creating device sampler object!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dev().settings().enableImageHandle_) {
|
||||
uint64_t sampler_srd = hsa_sampler.handle;
|
||||
argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
|
||||
samplerList_.push_back(hsa_sampler);
|
||||
// TODO: destroy sampler.
|
||||
} else {
|
||||
argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
|
||||
|
||||
memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
|
||||
argPtr += HSA_SAMPLER_OBJECT_SIZE;
|
||||
hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check there is no arguments' buffer overflow
|
||||
assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
|
||||
// Load all kernel arguments
|
||||
WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0);
|
||||
// Note: In a case of structs the size won't match,
|
||||
// since HSAIL compiler expects a reference...
|
||||
assert(gpuKernel.KernargSegmentByteSize() <= signature.paramsSize() &&
|
||||
"A mismatch of sizes of arguments between compiler and runtime!");
|
||||
|
||||
// Check for group memory overflow
|
||||
//! @todo Check should be in HSA - here we should have at most an assert
|
||||
|
||||
@@ -217,7 +217,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
//! Detects memory dependency for HSAIL kernels and uses appropriate AQL header
|
||||
bool processMemObjects(const amd::Kernel& kernel, //!< AMD kernel object for execution
|
||||
const_address params //!< Pointer to the param's store
|
||||
const_address params, //!< Pointer to the param's store
|
||||
size_t& ldsAddress //!< LDS usage
|
||||
);
|
||||
// Return the virtual gpu unique index
|
||||
uint index() const { return index_; }
|
||||
@@ -313,4 +314,34 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
//! Writes a kernel argument into an argument buffer (generic form).
//!
//! Copies \p size bytes from \p src to the location \p dst + \p offset.
//! No bounds or alignment checks are performed; the caller must make sure
//! that the destination range lies inside the buffer.
template <typename T>
inline void WriteAqlArgAt(
    unsigned char* dst,  //!< The write pointer to the buffer
    const T* src,        //!< The source pointer
    uint size,           //!< The size in bytes to copy
    size_t offset        //!< The byte offset from dst at which the data is written
) {
  memcpy(dst + offset, src, size);
}

//! Writes a 32-bit kernel argument into an argument buffer.
//!
//! Always stores exactly sizeof(uint32_t) bytes; \p size is ignored.
template <>
inline void WriteAqlArgAt(
    unsigned char* dst,   //!< The write pointer to the buffer
    const uint32_t* src,  //!< The source pointer
    uint size,            //!< Ignored: the copy size is always sizeof(uint32_t)
    size_t offset         //!< The byte offset from dst at which the data is written
) {
  // A fixed-size memcpy is used instead of a store through a casted pointer:
  // dst + offset is not guaranteed to be 4-byte aligned and dst is a byte
  // buffer, so dereferencing a reinterpret_cast'ed uint32_t* would be
  // undefined behaviour.
  memcpy(dst + offset, src, sizeof(uint32_t));
}

//! Writes a 64-bit kernel argument into an argument buffer.
//!
//! Always stores exactly sizeof(uint64_t) bytes; \p size is ignored.
template <>
inline void WriteAqlArgAt(
    unsigned char* dst,   //!< The write pointer to the buffer
    const uint64_t* src,  //!< The source pointer
    uint size,            //!< Ignored: the copy size is always sizeof(uint64_t)
    size_t offset         //!< The byte offset from dst at which the data is written
) {
  // See the 32-bit specialization: memcpy keeps the store well-defined for
  // any offset, including ones that are not 8-byte aligned.
  memcpy(dst + offset, src, sizeof(uint64_t));
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user