diff --git a/rocclr/runtime/device/rocm/rockernel.cpp b/rocclr/runtime/device/rocm/rockernel.cpp index 68e6d96944..47268bc612 100644 --- a/rocclr/runtime/device/rocm/rockernel.cpp +++ b/rocclr/runtime/device/rocm/rockernel.cpp @@ -231,6 +231,37 @@ static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo) return ROC_ADDRESS_ERROR; } +inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) { + switch (arg->type_){ + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: + return amd::KernelParameterDescriptor::HiddenGlobalOffsetX; + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: + return amd::KernelParameterDescriptor::HiddenGlobalOffsetY; + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: + return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ; + case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: + return amd::KernelParameterDescriptor::HiddenPrintfBuffer; + case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: + return amd::KernelParameterDescriptor::HiddenDefaultQueue; + case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: + return amd::KernelParameterDescriptor::HiddenCompletionAction; + case ROC_ARGTYPE_POINTER: + return amd::KernelParameterDescriptor::MemoryObject; + case ROC_ARGTYPE_IMAGE: + return amd::KernelParameterDescriptor::ImageObject; + case ROC_ARGTYPE_REFERENCE: + return amd::KernelParameterDescriptor::ReferenceObject; + case ROC_ARGTYPE_VALUE: + return amd::KernelParameterDescriptor::ValueObject; + case ROC_ARGTYPE_SAMPLER: + return amd::KernelParameterDescriptor::SamplerObject; + case ROC_ARGTYPE_QUEUE: + return amd::KernelParameterDescriptor::QueueObject; + default: + return amd::KernelParameterDescriptor::HiddenNone; + } +} + #if defined(WITH_LIGHTNING_COMPILER) static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) { aclArgDataType dataType; @@ -514,6 +545,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argI #if defined(WITH_COMPILER_LIB) void HSAILKernel::initArguments(const aclArgData* aclArg) { device::Kernel::parameters_t params; + device::Kernel::parameters_t hiddenParams; + size_t offsetStruct = KernargSegmentByteSize(); // Iterate through the arguments and insert into parameterList for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) { @@ -539,17 +572,27 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) { arg->index_ = isHidden ? uint(-1) : params.size(); hsailArgList_.push_back(arg); + amd::KernelParameterDescriptor desc; + + // Allocate the hidden arguments, but abstraction layer will skip them if (isHidden) { + offset = amd::alignUp(offset, arg->alignment_); + desc.offset_ = offset; + desc.size_ = arg->size_; + offset += arg->size_; + desc.info_.oclObject_ = GetOclArgumentType(arg); + hiddenParams.push_back(desc); continue; } - amd::KernelParameterDescriptor desc; desc.name_ = arg->name_.c_str(); desc.type_ = GetOclType(arg); desc.addressQualifier_ = GetOclAddrQual(arg); desc.accessQualifier_ = GetOclAccessQual(arg); desc.typeQualifier_ = GetOclTypeQual(aclArg); desc.typeName_ = arg->typeName_.c_str(); + desc.info_.oclObject_ = GetOclArgumentType(arg); + desc.info_.arrayIndex_ = arg->pointeeAlignment_; // set image related flags if (arg->type_ == ROC_ARGTYPE_IMAGE) { @@ -566,19 +609,48 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) { // and CPU sends the parameters as they are allocated in memory size_t size = desc.size_; - offset = amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); + // Check if HSAIL expects data by reference and allocate it behind + if (arg->type_ == ROC_ARGTYPE_REFERENCE) { + desc.offset_ = offsetStruct; + // Align the offset reference + offset = amd::alignUp(offset, sizeof(size_t)); + patchReferences_.insert({desc.offset_, offset}); + offsetStruct += size; + // Adjust the offset of arguments + offset += sizeof(size_t); + } + else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + // These objects have forced data size to uint64_t + offset = amd::alignUp(offset, sizeof(uint64_t)); + desc.offset_ = offset; + offset += sizeof(uint64_t); + } else { + offset = amd::alignUp(offset, arg->alignment_); + desc.offset_ = offset; + offset += size; + } + + // Update read only flag + desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false; params.push_back(desc); } - createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0); + + // Save the number of OCL arguments + uint32_t numParams = params.size(); + // Append the hidden arguments to the OCL arguments + params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); + createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); } #endif // defined(WITH_COMPILER_LIB) #if defined(WITH_LIGHTNING_COMPILER) void LightningKernel::initArguments(const KernelMD& kernelMD) { device::Kernel::parameters_t params; + device::Kernel::parameters_t hiddenParams; + size_t offsetStruct = KernargSegmentByteSize(); size_t offset = 0; @@ -607,19 +679,27 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) { arg->index_ = isHidden ? uint(-1) : params.size(); hsailArgList_.push_back(arg); - if (isHidden) { - continue; - } - // Initialize Device kernel parameters amd::KernelParameterDescriptor desc; + if (isHidden) { + offset = amd::alignUp(offset, arg->alignment_); + desc.offset_ = offset; + desc.size_ = arg->size_; + offset += arg->size_; + desc.info_.oclObject_ = GetOclArgumentType(arg); + hiddenParams.push_back(desc); + continue; + } + desc.name_ = lcArg.mName.c_str(); desc.type_ = GetOclType(arg); desc.addressQualifier_ = GetOclAddrQual(arg); desc.accessQualifier_ = GetOclAccessQual(arg); desc.typeQualifier_ = GetOclTypeQual(lcArg); desc.typeName_ = lcArg.mTypeName.c_str(); + desc.info_.oclObject_ = GetOclArgumentType(arg); + desc.info_.arrayIndex_ = arg->pointeeAlignment_; // set image related flags if (arg->type_ == ROC_ARGTYPE_IMAGE) { @@ -629,6 +709,7 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) { flags_.imageWrite_ = true; } } + desc.size_ = arg->size_; // Make offset alignment to match CPU metadata, since @@ -636,13 +717,40 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) { // and CPU sends the parameters as they are allocated in memory size_t size = desc.size_; - offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); + // Check if HSAIL expects data by reference and allocate it behind + if (arg->type_ == ROC_ARGTYPE_REFERENCE) { + desc.offset_ = offsetStruct; + // Align the offset reference + offset = amd::alignUp(offset, sizeof(size_t)); + patchReferences_.insert({desc.offset_, offset}); + offsetStruct += size; + // Adjust the offset of arguments + offset += sizeof(size_t); + } + else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + // These objects have forced data size to uint64_t + offset = amd::alignUp(offset, sizeof(uint64_t)); + desc.offset_ = offset; + offset += sizeof(uint64_t); + } else { + offset = amd::alignUp(offset, arg->alignment_); + desc.offset_ = offset; + offset += size; + } + + // Update read only flag + desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false; params.push_back(desc); } - createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0); + + // Save the number of OCL arguments + uint32_t numParams = params.size(); + // Append the hidden arguments to the OCL arguments + params.insert(params.end(), hiddenParams.begin(), hiddenParams.end()); + createSignature(params, numParams, amd::KernelSignature::ABIVersion_1); } #endif // defined(WITH_LIGHTNING_COMPILER) diff --git a/rocclr/runtime/device/rocm/rockernel.hpp b/rocclr/runtime/device/rocm/rockernel.hpp index f0b8690e71..0c1c0f7e18 100644 --- a/rocclr/runtime/device/rocm/rockernel.hpp +++ b/rocclr/runtime/device/rocm/rockernel.hpp @@ -140,6 +140,8 @@ class Kernel : public device::Kernel { //! Return TRUE if kernel wirtes images bool imageWrite() const { return (flags_.imageWrite_) ? true : false; } + const std::unordered_map& patch() const { return patchReferences_; } + protected: union Flags { struct { @@ -162,6 +164,7 @@ class Kernel : public device::Kernel { const uint32_t kernargSegmentAlignment_; size_t kernelDirectiveOffset_; std::vector printf_; + std::unordered_map patchReferences_; //!< Patch table for references }; #if defined(WITH_COMPILER_LIB) diff --git a/rocclr/runtime/device/rocm/rocsettings.cpp b/rocclr/runtime/device/rocm/rocsettings.cpp index a768801d3b..6c6c7d71da 100644 --- a/rocclr/runtime/device/rocm/rocsettings.cpp +++ b/rocclr/runtime/device/rocm/rocsettings.cpp @@ -20,7 +20,6 @@ Settings::Settings() { pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION; enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; - enableImageHandle_ = true; maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; diff --git a/rocclr/runtime/device/rocm/rocsettings.hpp b/rocclr/runtime/device/rocm/rocsettings.hpp index 1ecd636d2a..d3c601de4c 100644 --- a/rocclr/runtime/device/rocm/rocsettings.hpp +++ b/rocclr/runtime/device/rocm/rocsettings.hpp @@ -22,7 +22,6 @@ class Settings : public device::Settings { uint doublePrecision_ : 1; //!< Enables double precision support uint pollCompletion_ : 1; //!< Enables polling in HSA uint enableLocalMemory_ : 1; //!< Enable GPUVM memory - uint enableImageHandle_ : 1; //!< Use HSAIL image/sampler pointer uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint enablePartialDispatch_ : 1; //!< Enable support for Partial Dispatch uint imageDMA_ : 1; //!< Enable direct image DMA transfers diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp index 41c95d9995..9d9cdecd34 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -185,8 +185,37 @@ void VirtualGPU::MemoryDependency::clear(bool all) { } } -bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params) { - const Kernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev()))); +static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor, + const amd::Sampler& sampler) { + samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST + ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST + : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; + samplerDescriptor.coordinate_mode = sampler.normalizedCoords() + ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED + : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; + switch (sampler.addressingMode()) { + case CL_ADDRESS_CLAMP_TO_EDGE: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + break; + case CL_ADDRESS_REPEAT: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT; + break; + case CL_ADDRESS_CLAMP: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; + break; + case CL_ADDRESS_MIRRORED_REPEAT: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; + break; + case CL_ADDRESS_NONE: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED; + break; + default: + return; + } +} + +bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params, size_t& ldsAddress) { + Kernel& hsaKernel = const_cast(static_cast(*(kernel.getDeviceKernel(dev())))); const amd::KernelSignature& signature = kernel.signature(); const amd::KernelParameters& kernelParams = kernel.parameters(); @@ -256,38 +285,141 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para // Check all parameters for the current kernel for (size_t i = 0; i < signature.numParameters(); ++i) { const amd::KernelParameterDescriptor& desc = signature.at(i); - const Kernel::Argument* arg = hsaKernel.hsailArgAt(i); Memory* gpuMem = nullptr; - bool readOnly = false; amd::Memory* mem = nullptr; // Find if current argument is a buffer - if ((desc.type_ == T_POINTER) && (arg->addrQual_ != ROC_ADDRESS_LOCAL)) { - uint32_t index = desc.info_.arrayIndex_; - mem = memories[index]; - if (mem != nullptr) { - gpuMem = static_cast(mem->getDeviceMemory(dev())); - // Don't sync for internal objects, - // since they are not shared between devices - if (gpuMem->owner()->getVirtualDevice() == nullptr) { - // Synchronize data with other memory instances if necessary - gpuMem->syncCacheFromHost(*this); + if (desc.type_ == T_POINTER) { + if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { + // Align the LDS on the alignment requirement of type pointed to + ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_); + if (desc.size_ == 8) { + // Save the original LDS size + uint64_t ldsSize = *reinterpret_cast(params + desc.offset_); + // Patch the LDS address in the original arguments with an LDS address(offset) + WriteAqlArgAt(const_cast
(params), &ldsAddress, desc.size_, desc.offset_); + // Add the original size + ldsAddress += ldsSize; + } else { + // Save the original LDS size + uint32_t ldsSize = *reinterpret_cast(params + desc.offset_); + // Patch the LDS address in the original arguments with an LDS address(offset) + uint32_t ldsAddr = ldsAddress; + WriteAqlArgAt(const_cast
(params), &ldsAddr, desc.size_, desc.offset_); + // Add the original size + ldsAddress += ldsSize; } } - //! This condition is for SVM fine-grain - if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) { - // Sync AQL packets - setAqlHeader(kDispatchPacketHeader); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } else if (gpuMem != nullptr) { - readOnly |= (arg->access_ == ROC_ACCESS_TYPE_RO); - // Validate memory for a dependency in the queue - memoryDependency().validate(*this, gpuMem, readOnly); + else { + uint32_t index = desc.info_.arrayIndex_; + mem = memories[index]; + if (mem == nullptr) { + //! This condition is for SVM fine-grain + if (dev().isFineGrainedSystem(true)) { + // Sync AQL packets + setAqlHeader(kDispatchPacketHeader); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + } + } + else { + gpuMem = static_cast(mem->getDeviceMemory(dev())); + // Don't sync for internal objects, + // since they are not shared between devices + if (gpuMem->owner()->getVirtualDevice() == nullptr) { + // Synchronize data with other memory instances if necessary + gpuMem->syncCacheFromHost(*this); + } + + // Validate memory for a dependency in the queue + memoryDependency().validate(*this, gpuMem, (desc.info_.readOnly_ == 1)); + + assert((desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_GLOBAL || + desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) && + "Unsupported address qualifier"); + + const bool readOnly = +#if defined(WITH_LIGHTNING_COMPILER) + desc.typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST || +#endif // defined(WITH_LIGHTNING_COMPILER) + (mem->getMemFlags() & CL_MEM_READ_ONLY) != 0; + + if (!readOnly) { + mem->signalWrite(&dev()); + } + + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { + Image* image = static_cast(mem->getDeviceMemory(dev())); + + const uint64_t image_srd = image->getHsaImageObject().handle; + assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); + WriteAqlArgAt(const_cast
(params), &image_srd, sizeof(image_srd), desc.offset_); + } + } } } + else if (desc.type_ == T_QUEUE) { + uint32_t index = desc.info_.arrayIndex_; + const amd::DeviceQueue* queue = reinterpret_cast( + params + kernelParams.queueObjOffset())[index]; + + if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) { + return false; + } + hsaKernel.setDynamicParallelFlag(true); + uint64_t vqVA = getVQVirtualAddress(); + WriteAqlArgAt(const_cast
(params), &vqVA, sizeof(vqVA), desc.offset_); + } + else if (desc.type_ == T_VOID) { + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) { + const_address srcArgPtr = params + desc.offset_; + void* mem = allocKernArg(desc.size_, 128); + if (mem == nullptr) { + LogError("Out of memory"); + return false; + } + memcpy(mem, srcArgPtr, desc.size_); + const auto it = hsaKernel.patch().find(desc.offset_); + WriteAqlArgAt(const_cast
(params), &mem, sizeof(void*), it->second); + } + } + else if (desc.type_ == T_SAMPLER) { + uint32_t index = desc.info_.arrayIndex_; + const amd::Sampler* sampler = reinterpret_cast(params + + kernelParams.samplerObjOffset())[index]; + + hsa_ext_sampler_descriptor_t samplerDescriptor; + fillSampleDescriptor(samplerDescriptor, *sampler); + + hsa_ext_sampler_t hsa_sampler; + hsa_status_t status = + hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler); + + if (status != HSA_STATUS_SUCCESS) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); + // Release the sampler handles allocated for the various + // on one or more kernel submissions + for (const auto& it: samplerList_) { + if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) { + LogWarning("Error destroying device sampler object!"); + } + } + + samplerList_.clear(); + status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler); + if (status != HSA_STATUS_SUCCESS) { + LogError("Error creating device sampler object!"); + return false; + } + } + + uint64_t sampler_srd = hsa_sampler.handle; + WriteAqlArgAt(const_cast
(params), &sampler_srd, sizeof(sampler_srd), desc.offset_); + samplerList_.push_back(hsa_sampler); + // TODO: destroy sampler. + } } if (hsaKernel.program()->hasGlobalStores()) { @@ -1438,26 +1570,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { profilingEnd(vcmd); } -/*! \brief Writes to the buffer and increments the write pointer to the - * buffer. Also, ensures that the argument is written to an - * aligned memory as specified. Return the new write pointer. - * - * @param dst The write pointer to the buffer - * @param src The source pointer - * @param size The size in bytes to copy - * @param alignment The alignment to follow while writing to the buffer - */ -static inline address addArg(address dst, const void* src, size_t size, uint32_t alignment) { - dst = amd::alignUp(dst, alignment); - ::memcpy(dst, src, size); - return dst + size; -} - -static inline address addArg(address dst, const void* src, size_t size) { - assert(size < UINT32_MAX); - return addArg(dst, src, size, size); -} - // Over rides the workgroup size fields in the packet with runtime/compiler set sizes void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket, amd::NDRangeContainer sizes, device::Kernel* devKernel, @@ -1584,35 +1696,6 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket, } } -static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor, - const amd::Sampler& sampler) { - samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST - ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST - : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; - samplerDescriptor.coordinate_mode = sampler.normalizedCoords() - ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED - : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; - switch (sampler.addressingMode()) { - case CL_ADDRESS_CLAMP_TO_EDGE: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - break; - case CL_ADDRESS_REPEAT: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT; - break; - case CL_ADDRESS_CLAMP: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; - break; - case CL_ADDRESS_MIRRORED_REPEAT: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; - break; - case CL_ADDRESS_NONE: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED; - break; - default: - return; - } -} - bool VirtualGPU::createSchedulerParam() { if (nullptr != schedulerParam_) { @@ -1797,12 +1880,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const const_address parameters, void* eventHandle) { device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernel = static_cast(*devKernel); - - const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); - size_t ldsUsage = compilerLdsUsage; + size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); // Check memory dependency and SVM objects - if (!processMemObjects(kernel, parameters)) { + if (!processMemObjects(kernel, parameters, ldsUsage)) { LogError("Wrong memory objects!"); return false; } @@ -1868,58 +1949,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const return false; } - address argPtr = argBuffer; - for (auto arg : gpuKernel.hsailArgs()) { - const_address srcArgPtr = nullptr; - if (arg->index_ != uint(-1)) { - srcArgPtr = parameters + signature.at(arg->index_).offset_; - } - - // Handle the hidden arguments first, as they do not have a - // matching parameter in the OCL signature (not a valid arg->index_) - switch (arg->type_) { - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: { - size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0; - assert(arg->size_ == sizeof(offset_x) && "check the sizes"); - argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_); + // Check if runtime has to setup hidden arguments + for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) { + const auto it = signature.at(i); + size_t offset; + switch (it.info_.oclObject_) { + case amd::KernelParameterDescriptor::HiddenNone: + break; + case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: { + offset = newOffset[0]; + assert(it.size_ == sizeof(offset) && "check the sizes"); + WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); break; } - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: { - size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0; - assert(arg->size_ == sizeof(offset_y) && "check the sizes"); - argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_); + case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: { + if (sizes.dimensions() >= 2) { + offset = newOffset[1]; + assert(it.size_ == sizeof(offset) && "check the sizes"); + WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); + } break; } - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: { - size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0; - assert(arg->size_ == sizeof(offset_z) && "check the sizes"); - argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_); + case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: { + if (sizes.dimensions() >= 3) { + offset = newOffset[2]; + assert(it.size_ == sizeof(offset) && "check the sizes"); + WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); + } break; } - case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: { + case amd::KernelParameterDescriptor::HiddenPrintfBuffer: { address bufferPtr = printfDbg()->dbgBuffer(); - assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); - argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_); + if (printfEnabled && + // and printf buffer was allocated + (bufferPtr != nullptr)) { + assert(it.size_ == sizeof(bufferPtr) && "check the sizes"); + WriteAqlArgAt(const_cast
(parameters), &bufferPtr, it.size_, it.offset_); + } break; } - case ROC_ARGTYPE_QUEUE: { - uint32_t index = signature.at(arg->index_).info_.arrayIndex_; - const amd::DeviceQueue* queue = reinterpret_cast(parameters + - kernelParams.samplerObjOffset())[index]; - if (queue == nullptr) { - return false; - } - - if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) { - return false; - } - gpuKernel.setDynamicParallelFlag(true); - uint64_t vqVA = getVQVirtualAddress(); - argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: { - + case amd::KernelParameterDescriptor::HiddenDefaultQueue: { amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); if (!createVirtualQueue(defQueue->size()) || !createSchedulerParam()) { @@ -1927,156 +1996,28 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } gpuKernel.setDynamicParallelFlag(true); uint64_t vqVA = getVQVirtualAddress(); - argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_); + WriteAqlArgAt(const_cast
(parameters), &vqVA, it.size_, it.offset_); break; } - case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: { - + case amd::KernelParameterDescriptor::HiddenCompletionAction: { Memory* schedulerMem = dev().getRocMemory(schedulerParam_); AmdAqlWrap* wrap = reinterpret_cast(reinterpret_cast(schedulerParam_->getHostMem()) + sizeof(SchedulerParam)); memset(wrap, 0, sizeof(AmdAqlWrap)); wrap->state = AQL_WRAP_DONE; uint64_t spVA = reinterpret_cast(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam); - argPtr = addArg(argPtr, &spVA, arg->size_, arg->alignment_); + WriteAqlArgAt(const_cast
(parameters), &spVA, it.size_, it.offset_); break; } - case ROC_ARGTYPE_HIDDEN_NONE: { - void* zero = 0; - assert(arg->size_ <= sizeof(zero) && "check the sizes"); - argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_POINTER: { - if (arg->addrQual_ == ROC_ADDRESS_LOCAL) { - // Align the LDS on the alignment requirement of type pointed to - ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_); - argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_); - if (sizeof(uint64_t) == arg->size_) { - ldsUsage += *reinterpret_cast(srcArgPtr); - } else { - ldsUsage += *reinterpret_cast(srcArgPtr); - } - break; - } - assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL || arg->addrQual_ == ROC_ADDRESS_CONSTANT) && - "Unsupported address qualifier"); - argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - uint32_t index = signature.at(arg->index_).info_.arrayIndex_; - amd::Memory* mem = memories[index]; - if (mem == nullptr) { - break; - } - - const bool readOnly = -#if defined(WITH_LIGHTNING_COMPILER) - signature.at(arg->index_).typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST || -#endif // defined(WITH_LIGHTNING_COMPILER) - (mem->getMemFlags() & CL_MEM_READ_ONLY) != 0; - - if (!readOnly) { - mem->signalWrite(&dev()); - } - break; - } - case ROC_ARGTYPE_REFERENCE: { - void* mem = allocKernArg(arg->size_, arg->alignment_); - if (mem == nullptr) { - LogError("Out of memory"); - return false; - } - memcpy(mem, srcArgPtr, arg->size_); - argPtr = addArg(argPtr, &mem, sizeof(void*)); - break; - } - case ROC_ARGTYPE_VALUE: - argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - break; - case ROC_ARGTYPE_IMAGE: { - uint32_t index = signature.at(arg->index_).info_.arrayIndex_; - amd::Memory* mem = memories[index]; - Image* image = static_cast(mem->getDeviceMemory(dev())); - if (image == nullptr) { - LogError("Kernel image argument is not an image object"); - return false; - } - - if (dev().settings().enableImageHandle_) { - const uint64_t image_srd = image->getHsaImageObject().handle; - assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); - argPtr = addArg(argPtr, &image_srd, sizeof(image_srd)); - } else { - // Image arguments are of size 48 bytes and are aligned to 16 bytes - argPtr = addArg(argPtr, (void*)image->getHsaImageObject().handle, HSA_IMAGE_OBJECT_SIZE, - HSA_IMAGE_OBJECT_ALIGNMENT); - } - - const bool readOnly = -#if defined(WITH_LIGHTNING_COMPILER) - signature.at(arg->index_).accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY || -#endif // defined(WITH_LIGHTNING_COMPILER) - mem->getMemFlags() & CL_MEM_READ_ONLY; - - if (!readOnly) { - mem->signalWrite(&dev()); - } - break; - } - case ROC_ARGTYPE_SAMPLER: { - uint32_t index = signature.at(arg->index_).info_.arrayIndex_; - const amd::Sampler* sampler = reinterpret_cast(parameters + - kernelParams.samplerObjOffset())[index]; - if (sampler == nullptr) { - LogError("Kernel sampler argument is not an sampler object"); - return false; - } - - hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSampleDescriptor(samplerDescriptor, *sampler); - - hsa_ext_sampler_t hsa_sampler; - hsa_status_t status = - hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler); - if (status != HSA_STATUS_SUCCESS) { - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - // Release the sampler handles allocated for the various - // on one or more kernel submissions - for (const auto& it: samplerList_) { - if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) { - LogWarning("Error destroying device sampler object!"); - } - } - samplerList_.clear(); - - status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler); - if (status != HSA_STATUS_SUCCESS) { - LogError("Error creating device sampler object!"); - return false; - } - } - - if (dev().settings().enableImageHandle_) { - uint64_t sampler_srd = hsa_sampler.handle; - argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd)); - samplerList_.push_back(hsa_sampler); - // TODO: destroy sampler. - } else { - argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); - - memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE); - argPtr += HSA_SAMPLER_OBJECT_SIZE; - hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler); - } - break; - } - default: - return false; } } - // Check there is no arguments' buffer overflow - assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize()); + // Load all kernel arguments + WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0); + // Note: In a case of structs the size won't match, + // since HSAIL compiler expects a reference... + assert(gpuKernel.KernargSegmentByteSize() <= signature.paramsSize() && + "A mismatch of sizes of arguments between compiler and runtime!"); // Check for group memory overflow //! @todo Check should be in HSA - here we should have at most an assert diff --git a/rocclr/runtime/device/rocm/rocvirtual.hpp b/rocclr/runtime/device/rocm/rocvirtual.hpp index 520cc9f515..40758f8fd5 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -217,7 +217,8 @@ class VirtualGPU : public device::VirtualDevice { //! Detects memory dependency for HSAIL kernels and uses appropriate AQL header bool processMemObjects(const amd::Kernel& kernel, //!< AMD kernel object for execution - const_address params //!< Pointer to the param's store + const_address params, //!< Pointer to the param's store + size_t& ldsAddress //!< LDS usage ); // Retun the virtual gpu unique index uint index() const { return index_; } @@ -313,4 +314,34 @@ class VirtualGPU : public device::VirtualDevice { }; }; + +template +inline void WriteAqlArgAt( + unsigned char* dst, //!< The write pointer to the buffer + const T* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer +) { + memcpy(dst + offset, src, size); +} + +template <> +inline void WriteAqlArgAt( + unsigned char* dst, //!< The write pointer to the buffer + const uint32_t* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer +) { + *(reinterpret_cast(dst + offset)) = *src; +} + +template <> +inline void WriteAqlArgAt( + unsigned char* dst, //!< The write pointer to the buffer + const uint64_t* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer +) { + *(reinterpret_cast(dst + offset)) = *src; +} }