diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp index 0be2916483..718afc078a 100644 --- a/projects/clr/rocclr/runtime/device/device.cpp +++ b/projects/clr/rocclr/runtime/device/device.cpp @@ -600,7 +600,9 @@ Settings::Settings() { //!< concurrent Virtual GPUs for default } -bool Kernel::createSignature(const parameters_t& params) { +bool Kernel::createSignature( + const parameters_t& params, const parameters_t& hiddenParams, + uint32_t version) { std::stringstream attribs; if (workGroupInfo_.compileSize_[0] != 0) { attribs << "reqd_work_group_size("; @@ -632,7 +634,7 @@ bool Kernel::createSignature(const parameters_t& params) { // Destroy old signature if it was allocated before // (offline devices path) delete signature_; - signature_ = new amd::KernelSignature(params, attribs.str()); + signature_ = new amd::KernelSignature(params, attribs.str(), hiddenParams, version); if (NULL != signature_) { return true; } diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp index 1b3d33f25a..4c32821207 100644 --- a/projects/clr/rocclr/runtime/device/device.hpp +++ b/projects/clr/rocclr/runtime/device/device.hpp @@ -852,7 +852,9 @@ class Kernel : public amd::HeapObject { const std::string& name() const { return name_; } //! Initializes the kernel parameters for the abstraction layer - bool createSignature(const parameters_t& params); + bool createSignature( + const parameters_t& params, const parameters_t& hiddenParams, + uint32_t version); //! Returns TRUE if it's a HSA kernel bool hsa() const { return hsa_; } @@ -1624,6 +1626,22 @@ class Device : public RuntimeObject { }; struct KernelParameterDescriptor { + enum { + Value = 0, + HiddenNone = 1, + HiddenGlobalOffsetX = 2, + HiddenGlobalOffsetY = 3, + HiddenGlobalOffsetZ = 4, + HiddenPrintfBuffer = 5, + HiddenDefaultQueue = 6, + HiddenCompletionAction = 7, + MemoryObject = 8, + ReferenceObject = 9, + ValueObject = 10, + ImageObject = 11, + SamplerObject = 12, + QueueObject = 13 + }; const char* name_; //!< The parameter's name in the source clk_value_type_t type_; //!< The parameter's type size_t offset_; //!< Its offset in the parameter's stack @@ -1642,7 +1660,7 @@ struct KernelParameterDescriptor { uint32_t rawPointer_ : 1; //!< Arguments have a raw GPU VA uint32_t defined_ : 1; //!< The argument was defined by the app uint32_t reserved_ : 1; //!< reserved - uint32_t arrayIndex_ : 28; //!< Index in the objects array + uint32_t arrayIndex_ : 24; //!< Index in the objects array or LDS alignment }; uint32_t allValues_; InfoData() : allValues_(0) {} diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp index e49fe8b63b..09c911022e 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp @@ -752,7 +752,8 @@ bool NullKernel::create(const std::string& code, const std::string& metadata, workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed; device::Kernel::parameters_t params; - if (!createSignature(params)) { + device::Kernel::parameters_t hiddenParams; + if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) { return false; } @@ -1337,7 +1338,8 @@ bool Kernel::initParameters() { workGroupInfo_.localMemSize_ = hwLocalSize_; } - if (!createSignature(params)) { + device::Kernel::parameters_t hiddenParams; + if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) { return false; } @@ -3017,7 +3019,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) { } } - createSignature(params); + device::Kernel::parameters_t hiddenParams; + createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0); } void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp index b8845ac7f6..f64af9d2cf 100644 --- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -943,24 +943,30 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo uint32_t uint32_value = 0; uint64_t uint64_value = 0; + size_t argSize = desc.size_; if (desc.type_ == T_POINTER && desc.size_ != 0) { if ((value == NULL) || (static_cast(value) == NULL)) { - LP64_SWITCH(uint32_value, uint64_value) = 0; reinterpret_cast(kernel->parameters().values() + kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr; } else { // convert cl_mem to amd::Memory*, return false if invalid. LP64_SWITCH(uint32_value, uint64_value) = static_cast(( - *static_cast(value))->vmAddress()); + *static_cast(value))->virtualAddress()); reinterpret_cast(kernel->parameters().values() + kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = *static_cast(value); + // Note: Special case for image SRD, which is 64 bit always + if (LP64_SWITCH(true, false) && + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) { + uint64_value = uint32_value; + argSize = sizeof(uint64_t); + } } } else if (desc.type_ == T_SAMPLER) { assert(false && "No sampler support in blit manager! Use internal samplers!"); } else - switch (desc.size_) { + switch (argSize) { case 1: uint32_value = *static_cast(value); break; @@ -977,7 +983,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo break; } - switch (desc.size_) { + switch (argSize) { case 0 /*local mem*/: *static_cast(param) = size; break; diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp index c2a2be0c01..f181218413 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -228,6 +228,37 @@ inline static int GetHSAILArgSize(const aclArgData* argInfo) { } } +inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) { + switch (arg->type_){ + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: + return amd::KernelParameterDescriptor::HiddenGlobalOffsetX; + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: + return amd::KernelParameterDescriptor::HiddenGlobalOffsetY; + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: + return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ; + case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: + return amd::KernelParameterDescriptor::HiddenPrintfBuffer; + case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE: + return amd::KernelParameterDescriptor::HiddenDefaultQueue; + case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION: + return amd::KernelParameterDescriptor::HiddenCompletionAction; + case HSAIL_ARGTYPE_POINTER: + return amd::KernelParameterDescriptor::MemoryObject; + case HSAIL_ARGTYPE_IMAGE: + return amd::KernelParameterDescriptor::ImageObject; + case HSAIL_ARGTYPE_REFERENCE: + return amd::KernelParameterDescriptor::ReferenceObject; + case HSAIL_ARGTYPE_VALUE: + return amd::KernelParameterDescriptor::ValueObject; + case HSAIL_ARGTYPE_SAMPLER: + return amd::KernelParameterDescriptor::SamplerObject; + case HSAIL_ARGTYPE_QUEUE: + return amd::KernelParameterDescriptor::QueueObject; + default: + return amd::KernelParameterDescriptor::HiddenNone; + } +} + inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) { static const clk_value_type_t ClkValueMapType[6][6] = { {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16}, @@ -422,12 +453,22 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) { // Iterate through the arguments and insert into parameterList device::Kernel::parameters_t params; + device::Kernel::parameters_t hiddenParams; amd::KernelParameterDescriptor desc; size_t offset = 0; + size_t offsetStruct = argsBufferSize(); for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { - // skip the hidden arguments - if (arguments_[i]->index_ == uint(-1)) continue; + // Allocate the hidden arguments, but abstraction layer will skip them + if (arguments_[i]->index_ == uint(-1)) { + offset = amd::alignUp(offset, arguments_[i]->alignment_); + desc.offset_ = offset; + desc.size_ = arguments_[i]->size_; + offset += arguments_[i]->size_; + desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]); + hiddenParams.push_back(desc); + continue; + } desc.name_ = arguments_[i]->name_.c_str(); desc.type_ = GetOclType(arguments_[i]); @@ -435,6 +476,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) { desc.accessQualifier_ = GetOclAccessQual(arguments_[i]); desc.typeQualifier_ = GetOclTypeQual(aclArg); desc.typeName_ = arguments_[i]->typeName_.c_str(); + desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]); + desc.info_.arrayIndex_ = arguments_[i]->pointeeAlignment_; // Make a check if it is local or global if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { @@ -451,9 +494,32 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) { // Local memory for CPU size = sizeof(cl_mem); } - offset = amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); + // Check if HSAIL expects data by reference and allocate it behind + if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) { + desc.offset_ = offsetStruct; + // Align the offset reference + offset = amd::alignUp(offset, sizeof(size_t)); + patchReferences_.insert({desc.offset_, offset}); + offsetStruct += size; + // Adjust the offset of arguments + offset += sizeof(size_t); + } else { + // These objects have forced data size to uint64_t + if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + offset = amd::alignUp(offset, sizeof(uint64_t)); + desc.offset_ = offset; + offset += sizeof(uint64_t); + } else { + offset = amd::alignUp(offset, arguments_[i]->alignment_); + desc.offset_ = offset; + offset += size; + } + } + // Update read only flag + desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; + params.push_back(desc); if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) { @@ -464,7 +530,7 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) { } } - createSignature(params); + createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1); } void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { @@ -869,247 +935,79 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS } } -template -inline void WriteAqlArg( - unsigned char** dst, //!< The write pointer to the buffer - const T* src, //!< The source pointer - uint size, //!< The size in bytes to copy - uint alignment //!< The alignment to follow while writing to the buffer -) { - *dst = amd::alignUp(*dst, alignment); - memcpy(*dst, src, size); - *dst += size; -} - -template <> -inline void WriteAqlArg( - unsigned char** dst, //!< The write pointer to the buffer - const uint32_t* src, //!< The source pointer - uint size, //!< The size in bytes to copy - uint alignment //!< The alignment to follow while writing to the buffer -) { - *dst = amd::alignUp(*dst, alignment); - *(reinterpret_cast(*dst)) = *src; - *dst += size; -} - -template <> -inline void WriteAqlArg( - unsigned char** dst, //!< The write pointer to the buffer - const uint64_t* src, //!< The source pointer - uint size, //!< The size in bytes to copy - uint alignment //!< The alignment to follow while writing to the buffer -) { - *dst = amd::alignUp(*dst, alignment); - *(reinterpret_cast(*dst)) = *src; - *dst += size; -} - -const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, - const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap) const { - static const bool WaitOnBusyEngine = true; - uint64_t ldsAddress = ldsSize(); - address aqlArgBuf = gpu.cb(0)->SysMemCopy(); - bool srdResource = false; + const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const { + uint64_t argList; + address aqlArgBuf = gpu.managedBuffer().reserve( + argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList); + gpu.addVmMemory(gpu.managedBuffer().activeMemory()); if (dynamicParallelism()) { // Provide the host parent AQL wrap object to the kernel AmdAqlWrap wrap = {}; wrap.state = AQL_WRAP_BUSY; - const ConstantBuffer* cb = gpu.cb(1); - *vmParentWrap = cb->UploadDataToHw(&wrap, sizeof(AmdAqlWrap)); - gpu.addVmMemory(cb->ActiveMemory()); + *vmParentWrap = gpu.cb(1)->UploadDataToHw(&wrap, sizeof(AmdAqlWrap)); + gpu.addVmMemory(gpu.cb(1)->ActiveMemory()); } const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); - amd::Memory* const* memories = - reinterpret_cast(parameters + kernelParams.memoryObjOffset()); - // Find all parameters for the current kernel - for (auto arg : arguments_) { - const_address paramaddr = nullptr; - if (arg->index_ != uint(-1)) { - paramaddr = parameters + signature.at(arg->index_).offset_; - } - - // Handle the hidden arguments first, as they do not have a - // matching parameter in the OCL signature (not a valid arg->index_) - switch (arg->type_) { - case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: { - size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0; - assert(arg->size_ == sizeof(offset_x) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_); + // Check if runtime has to setup hidden arguments + for (const auto& it : signature.hiddenParameters()) { + size_t offset; + switch (it.info_.oclObject_) { + case amd::KernelParameterDescriptor::HiddenNone: + //WriteAqlArgAt(aqlArgBuf, &zero, it.size_, it.offset_); break; - } - case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: { - size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0; - assert(arg->size_ == sizeof(offset_y) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_); + case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: + offset = sizes.offset()[0]; + WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); break; - } - case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: { - size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0; - assert(arg->size_ == sizeof(offset_z) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_); + case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: + if (sizes.dimensions() >= 2) { + offset = sizes.offset()[1]; + WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); + } break; - } - case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: { - size_t bufferPtr = 0; + case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: + if (sizes.dimensions() >= 3) { + offset = sizes.offset()[2]; + WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); + } + break; + case amd::KernelParameterDescriptor::HiddenPrintfBuffer: if ((printfInfo().size() > 0) && // and printf buffer was allocated (gpu.printfDbgHSA().dbgBuffer() != nullptr)) { // and set the fourth argument as the printf_buffer pointer - bufferPtr = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); + size_t bufferPtr = static_cast(gpu.printfDbgHSA(). + dbgBuffer()->vmAddress()); gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer()); - } - assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_); - break; - } - case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE: - assert(arg->size_ == sizeof(static_cast(vmDefQueue)) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_); - break; - case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION: - assert(arg->size_ == sizeof(static_cast(*vmParentWrap)) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_); - break; - case HSAIL_ARGTYPE_HIDDEN_NONE: { - void* zero = 0; - assert(arg->size_ <= sizeof(zero) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_); - break; - } - case HSAIL_ARGTYPE_POINTER: { - // If it is a local pointer - if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) { - ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_); - WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_); - ldsAddress += *reinterpret_cast(paramaddr); - break; - } - assert( - (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) && - "Unsupported address qualifier"); - WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr)); - break; - } - case HSAIL_ARGTYPE_REFERENCE: { - const ConstantBuffer* cb = gpu.cb(1); - // Copy the current structure into CB1 - size_t gpuPtr = static_cast(cb->UploadDataToHw(paramaddr, arg->size_)); - // Then use a pointer in aqlArgBuffer to CB1 - WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t), sizeof(size_t)); - gpu.addVmMemory(cb->ActiveMemory()); - break; - } - case HSAIL_ARGTYPE_VALUE: - if (arg->size_ == sizeof(uint32_t)) { - WriteAqlArg(&aqlArgBuf, reinterpret_cast(paramaddr), - sizeof(uint32_t), arg->alignment_); - } else if (arg->size_ == sizeof(uint64_t)) { - WriteAqlArg(&aqlArgBuf, reinterpret_cast(paramaddr), - sizeof(uint64_t), arg->alignment_); - } else { - WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_); + WriteAqlArgAt(const_cast
(parameters), &bufferPtr, it.size_, it.offset_); } break; - case HSAIL_ARGTYPE_IMAGE: { - Image* image = nullptr; - amd::Memory* mem = nullptr; - uint32_t index = signature.at(arg->index_).info_.arrayIndex_; - if (nativeMem) { - image = reinterpret_cast(memories)[index]; - if (nullptr != image) { - mem = image->owner(); - } - } else { - mem = memories[index]; - if (mem != nullptr) { - image = static_cast(dev().getGpuMemory(mem)); - } - } - - //! \note Special case for the image views. - //! Copy SRD to CB1, so blit manager will be able to release - //! this view without a wait for SRD resource. - if (image->memoryType() == Resource::ImageView) { - // Copy the current image SRD into CB1 - const ConstantBuffer* cb = gpu.cb(1); - uint64_t srd = cb->UploadDataToHw(image->hwState(), HsaImageObjectSize); - // Then use a pointer in aqlArgBuffer to CB1 - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd)); - gpu.addVmMemory(cb->ActiveMemory()); - } else { - uint64_t srd = image->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd)); - srdResource = true; - } - - if (image->desc().isDoppTexture_) { - gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(), - kernel.parameters().getExecPfpaVcop()); + case amd::KernelParameterDescriptor::HiddenDefaultQueue: + if (vmDefQueue != 0) { + WriteAqlArgAt(const_cast
(parameters), &vmDefQueue, it.size_, it.offset_); } break; - } - case HSAIL_ARGTYPE_SAMPLER: { - uint32_t index = signature.at(arg->index_).info_.arrayIndex_; - const amd::Sampler* sampler = reinterpret_cast(parameters + - kernelParams.samplerObjOffset())[index]; - const Sampler* gpuSampler = static_cast(sampler->getDeviceSampler(dev())); - uint64_t srd = gpuSampler->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd)); - srdResource = true; - break; - } - case HSAIL_ARGTYPE_QUEUE: { - uint32_t index = signature.at(arg->index_).info_.arrayIndex_; - const amd::DeviceQueue* queue = reinterpret_cast( - parameters + kernelParams.queueObjOffset())[index]; - VirtualGPU* gpuQueue = static_cast(queue->vDev()); - uint64_t vmQueue; - if (dev().settings().useDeviceQueue_) { - vmQueue = gpuQueue->vQueue()->vmAddress(); - } else { - if (!gpu.createVirtualQueue(queue->size())) { - LogError("Virtual queue creation failed!"); - return nullptr; - } - vmQueue = gpu.vQueue()->vmAddress(); + case amd::KernelParameterDescriptor::HiddenCompletionAction: + if (*vmParentWrap != 0) { + WriteAqlArgAt(const_cast
(parameters), vmParentWrap, it.size_, it.offset_); } - WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue), sizeof(vmQueue)); break; - } - default: - LogError(" Unsupported argument type "); - return nullptr; } } - if (ldsAddress > dev().info().localMemSize_) { - LogError("No local memory available\n"); - return nullptr; - } + // Load all kernel arguments + WriteAqlArgAt(aqlArgBuf, parameters, signature.paramsSize(), 0); + assert(argsBufferSize() == amd::alignUp(signature.paramsSize(), 16) && + "A mismatch of sizes of arguments between compiler and runtime!"); -#if defined(WITH_LIGHTNING_COMPILER) - // Check there is no arguments' buffer overflow. We may not use all the - // hidden argument slots. - assert(aqlArgBuf <= (gpu.cb(0)->SysMemCopy() + argsBufferSize())); -#else // !defined(WITH_LIGHTNING_COMPILER) - // HSAIL kernarg segment size is rounded up to multiple of 16. - aqlArgBuf = amd::alignUp(aqlArgBuf, 16); - assert((aqlArgBuf == (gpu.cb(0)->SysMemCopy() + argsBufferSize())) && - "Size and the number of arguments don't match!"); -#endif // !defined(WITH_LIGHTNING_COMPILER) - hsa_kernel_dispatch_packet_t* hsaDisp = - reinterpret_cast(gpu.cb(0)->SysMemCopy() + argsBufferSize()); + //hsa_kernel_dispatch_packet_t disp; + hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast( + gpu.cb(0)->SysMemCopy()); amd::NDRange local(sizes.local()); const amd::NDRange& global = sizes.global(); @@ -1117,6 +1015,12 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( // Check if runtime has to find local workgroup size findLocalWorkSize(sizes.dimensions(), sizes.global(), local); + constexpr uint16_t kDispatchPacketHeader = + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + hsaDisp->header = kDispatchPacketHeader; hsaDisp->setup = sizes.dimensions(); @@ -1134,28 +1038,16 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( hsaDisp->group_segment_size = ldsAddress - ldsSize(); hsaDisp->kernel_object = gpuAqlCode(); - const ConstantBuffer* cb = gpu.cb(0); - uint64_t argList = cb->UploadDataToHw( - argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t)); - hsaDisp->kernarg_address = reinterpret_cast(argList); hsaDisp->reserved2 = 0; hsaDisp->completion_signal.handle = 0; + memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t)); - gpu.addVmMemory(cb->ActiveMemory()); - gpu.addVmMemory(&prog().codeSegGpu()); - for (pal::Memory* mem : prog().globalStores()) { - gpu.addVmMemory(mem); - } if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties, - AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { gpu.addVmMemory(gpu.hsaQueueMem()); } - if (srdResource || prog().isStaticSampler()) { - dev().srds().fillResourceList(gpu); - } - return hsaDisp; } @@ -1398,6 +1290,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcA void LightningKernel::initArgList(const KernelMD& kernelMD) { device::Kernel::parameters_t params; + device::Kernel::parameters_t hiddenParams; + size_t offsetStruct = argsBufferSize(); size_t offset = 0; @@ -1426,20 +1320,27 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) { arg->index_ = isHidden ? uint(-1) : params.size(); arguments_.push_back(arg); - - if (isHidden) { - continue; - } - // Initialize Device kernel parameters amd::KernelParameterDescriptor desc; + if (isHidden) { + offset = amd::alignUp(offset, arguments_[i]->alignment_); + desc.offset_ = offset; + desc.size_ = arguments_[i]->size_; + offset += arguments_[i]->size_; + desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]); + hiddenParams.push_back(desc); + continue; + } + desc.name_ = lcArg.mName.c_str(); desc.type_ = GetOclType(arg); desc.addressQualifier_ = GetOclAddrQual(arg); desc.accessQualifier_ = GetOclAccessQual(arg); desc.typeQualifier_ = GetOclTypeQual(lcArg); desc.typeName_ = lcArg.mTypeName.c_str(); + desc.info_.oclObject_ = GetOclArgumentType(arg); + desc.info_.arrayIndex_ = arg->pointeeAlignment_; // Make a check if it is local or global if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { @@ -1456,14 +1357,37 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) { // Local memory for CPU size = sizeof(cl_mem); } - offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); + // Check if HSAIL expects data by reference and allocate it behind + if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) { + desc.offset_ = offsetStruct; + // Align the offset reference + offset = amd::alignUp(offset, sizeof(size_t)); + patchReferences_.insert({ desc.offset_, offset }); + offsetStruct += size; + // Adjust the offset of arguments + offset += sizeof(size_t); + } + else { + // These objects have forced data size to uint64_t + if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + offset = amd::alignUp(offset, sizeof(uint64_t)); + desc.offset_ = offset; + offset += sizeof(uint64_t); + } else { + offset = amd::alignUp(offset, arguments_[i]->alignment_); + desc.offset_ = offset; + offset += size; + } + } + // Update read only flag + desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; params.push_back(desc); } - createSignature(params); + createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1); } static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) { diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp index 66e4132055..7ffc144c8b 100644 --- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp @@ -182,7 +182,7 @@ class HSAILKernel : public device::Kernel { const amd::Kernel& kernel, //!< AMD kernel object const amd::NDRangeContainer& sizes, //!< NDrange container const_address parameters, //!< Application arguments for the kernel - bool nativeMem, //!< Native memory objects are passed + size_t ldsAddress, //!< LDS address that includes all arguments. uint64_t vmDefQueue, //!< GPU VM default queue pointer uint64_t* vmParentWrap //!< GPU VM parent aql wrap object ) const; @@ -204,6 +204,8 @@ class HSAILKernel : public device::Kernel { return waveLimiter_.getWavesPerSH(vdev); }; + const std::unordered_map& patch() const { return patchReferences_; } + private: //! Disable copy constructor HSAILKernel(const HSAILKernel&); @@ -234,6 +236,7 @@ class HSAILKernel : public device::Kernel { const HSAILProgram& prog_; //!< Reference to the parent program std::vector printf_; //!< Format strings for GPU printf support uint index_; //!< Kernel index in the program + std::unordered_map patchReferences_; //!< Patch table for references uint64_t code_; //!< GPU memory pointer to the kernel size_t codeSize_; //!< Size of ISA code diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp index 3f47b22e12..00fd8736d3 100644 --- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp @@ -219,6 +219,8 @@ class Image : public pal::Memory { size_t* slicePitch = NULL //!< Slice for the mapped memory ); + virtual uint64_t virtualAddress() const override { return hwSrd(); } + private: //! Disable copy constructor Image(const Image&); diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index b2e373a4f5..7c45951176 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -461,9 +461,8 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor if (flushL1Cache) { // Flush cache if (!gpu.profiling()) { - gpu.addBarrier(); + gpu.addBarrier(); } - // Clear memory dependency state const static bool All = true; clear(!All); @@ -2112,13 +2111,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const return false; } } - + size_t ldsSize; // Check memory dependency and SVM objects - if (!processMemObjectsHSA(kernel, parameters, nativeMem)) { + if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) { LogError("Wrong memory objects!"); return false; } - bool needFlush = false; // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd // are in the same cmdBuffer @@ -2194,7 +2192,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const uint64_t vmParentWrap = 0; // Program the kernel arguments for the GPU execution hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments( - *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap); + *this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap); if (nullptr == aqlPkt) { LogError("Couldn't load kernel arguments"); return false; @@ -2948,7 +2946,7 @@ void VirtualGPU::profileEvent(EngineType engine, bool type) const { } bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params, - bool nativeMem) { + bool nativeMem, size_t& ldsAddress) { const amd::KernelParameters& kernelParams = kernel.parameters(); // Mark the tracker with a new kernel, @@ -3015,68 +3013,155 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p } } + bool srdResource = false; amd::Memory* const* memories = reinterpret_cast(params + kernelParams.memoryObjOffset()); const HSAILKernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev()))); const amd::KernelSignature& signature = kernel.signature(); + ldsAddress = hsaKernel.ldsSize(); - // Check all parameters for the current kernel - for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i); - - // Find if current argument is a buffer - if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) { - Memory* gpuMem = nullptr; - amd::Memory* mem = nullptr; - uint32_t index = desc.info_.arrayIndex_; - if (nativeMem) { - gpuMem = reinterpret_cast(memories)[index]; - if (nullptr != gpuMem) { - mem = gpuMem->owner(); - } - } else { - mem = memories[index]; - if (mem != nullptr) { - gpuMem = dev().getGpuMemory(mem); - // Synchronize data with other memory instances if necessary - gpuMem->syncCacheFromHost(*this); - } - } - //! This condition is for SVM fine-grain - if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) { - addBarrier(); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } else if (gpuMem != nullptr) { - // Check image - bool readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; - // Check buffer - readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; - // Validate memory for a dependency in the queue - memoryDependency().validate(*this, gpuMem, readOnly); - - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer - constexpr bool WaitOnBusyEngine = true; - gpuMem->wait(*this, WaitOnBusyEngine); - - //! Check if compiler expects read/write - if ((mem != nullptr) && !desc.info_.readOnly_) { - mem->signalWrite(&dev()); - } - addVmMemory(gpuMem); + if (!nativeMem) { + // Process cache coherency first, since the extra transfers may affect + // other mem dependency tracking logic: TS and signalWrite() + for (uint i = 0; i < signature.numMemories(); ++i) { + amd::Memory* mem = memories[i]; + if (mem != nullptr) { + // Synchronize data with other memory instances if necessary + dev().getGpuMemory(mem)->syncCacheFromHost(*this); } } } - for (pal::Memory* mem : hsaKernel.prog().globalStores()) { + // Check all parameters for the current kernel + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + const amd::KernelParameterDescriptor::InfoData& info = desc.info_; + + // Find if current argument is a buffer + if (desc.type_ == T_POINTER) { + // If it is a local pointer + if (desc.size_ == 0) { + ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_); + // Save the original LDS size + size_t ldsSize = *reinterpret_cast(params + desc.offset_); + // Patch the LDS address in the original arguments with an LDS address(offset) + WriteAqlArgAt(const_cast
(params), &ldsAddress, sizeof(void*), desc.offset_); + // Add the original size + ldsAddress += ldsSize; + } else { + Memory* gpuMem = nullptr; + amd::Memory* mem = nullptr; + uint32_t index = info.arrayIndex_; + if (nativeMem) { + gpuMem = reinterpret_cast(memories)[index]; + if (nullptr != gpuMem) { + mem = gpuMem->owner(); + } + } else { + mem = memories[index]; + if (mem != nullptr) { + gpuMem = dev().getGpuMemory(mem); + } + } + //! This condition is for SVM fine-grain + if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) { + addBarrier(); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + continue; + } else if (gpuMem != nullptr) { + // Validate memory for a dependency in the queue + memoryDependency().validate(*this, gpuMem, info.readOnly_); + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + constexpr bool WaitOnBusyEngine = true; + gpuMem->wait(*this, WaitOnBusyEngine); + + addVmMemory(gpuMem); + + //! Check if compiler expects read/write. + //! Note: SVM with subbuffers has an issue with tracking. + //! Conformance can send read only subbuffer, but update the region + //! in the kernel. + if ((mem != nullptr) && + ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) || + ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) { + mem->signalWrite(&dev()); + } + if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { + //! \note Special case for the image views. + //! Copy SRD to CB1, so blit manager will be able to release + //! this view without a wait for SRD resource. + if (gpuMem->memoryType() == Resource::ImageView) { + // Copy the current image SRD into CB1 + uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize); + // Then use a pointer in aqlArgBuffer to CB1 + // Patch the GPU VA address in the original arguments + WriteAqlArgAt(const_cast
(params), &srd, sizeof(srd), desc.offset_); + addVmMemory(cb(1)->ActiveMemory()); + } else { + srdResource = true; + } + if (gpuMem->desc().isDoppTexture_) { + addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(), + kernel.parameters().getExecPfpaVcop()); + } + } + } + } + } + else if (desc.type_ == T_VOID) { + if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) { + // Copy the current structure into CB1 + size_t gpuPtr = static_cast(cb(1)->UploadDataToHw(params, desc.size_)); + // Then use a pointer in aqlArgBuffer to CB1 + const auto it = hsaKernel.patch().find(desc.offset_); + // Patch the GPU VA address in the original arguments + WriteAqlArgAt(const_cast
(params), &gpuPtr, sizeof(size_t), it->second); + addVmMemory(cb(1)->ActiveMemory()); + } + } + else if (desc.type_ == T_SAMPLER) { + srdResource = true; + } else if (desc.type_ == T_QUEUE) { + uint32_t index = desc.info_.arrayIndex_; + const amd::DeviceQueue* queue = reinterpret_cast( + params + kernelParams.queueObjOffset())[index]; + VirtualGPU* gpuQueue = static_cast(queue->vDev()); + uint64_t vmQueue; + if (dev().settings().useDeviceQueue_) { + vmQueue = gpuQueue->vQueue()->vmAddress(); + } else { + if (!createVirtualQueue(queue->size())) { + LogError("Virtual queue creation failed!"); + return false; + } + vmQueue = vQueue()->vmAddress(); + } + // Patch the GPU VA address in the original arguments + WriteAqlArgAt(const_cast
(params), &vmQueue, sizeof(vmQueue), desc.offset_); + break; + } + } + + if (ldsAddress > dev().info().localMemSize_) { + LogError("No local memory available\n"); + return false; + } + + if (srdResource || hsaKernel.prog().isStaticSampler()) { + dev().srds().fillResourceList(*this); + } + + addVmMemory(&hsaKernel.prog().codeSegGpu()); + + for (const pal::Memory* mem : hsaKernel.prog().globalStores()) { const static bool IsReadOnly = false; // Validate global store for a dependency in the queue memoryDependency().validate(*this, mem, IsReadOnly); + addVmMemory(mem); } return true; diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index fa48024c9d..fccee6d60e 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -378,6 +378,9 @@ class VirtualGPU : public device::VirtualDevice { //! Return xfer buffer for staging operations XferBuffer& xferWrite() { return writeBuffer_; } + //! Return managed buffer for staging operations + ManagedBuffer& managedBuffer() { return managedBuffer_; } + //! Adds a pinned memory object into a map void addPinnedMem(amd::Memory* mem); @@ -529,7 +532,8 @@ class VirtualGPU : public device::VirtualDevice { //! Detects memory dependency for HSAIL kernels and flushes caches bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution const_address params, //!< Pointer to the param's store - bool nativeMem //!< Native memory objects + bool nativeMem, //!< Native memory objects + size_t& ldsAddess //!< Returns LDS size, used in the kernel ); //! Common function for fill memory used by both svm Fill and non-svm fill @@ -644,4 +648,33 @@ uint VirtualGPU::Queue::submit(bool forceFlush) { return id; } +template +inline void WriteAqlArgAt( + unsigned char* dst, //!< The write pointer to the buffer + const T* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer +) { + memcpy(dst + offset, src, size); +} + +template <> +inline void WriteAqlArgAt( + unsigned char* dst, //!< The write pointer to the buffer + const uint32_t* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer +) { + *(reinterpret_cast(dst + offset)) = *src; +} + +template <> +inline void WriteAqlArgAt( + unsigned char* dst, //!< The write pointer to the buffer + const uint64_t* src, //!< The source pointer + uint size, //!< The size in bytes to copy + size_t offset //!< The alignment to follow while writing to the buffer +) { + *(reinterpret_cast(dst + offset)) = *src; +} /*@}*/} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp index f0bf4e95c6..b7cf1ed1c3 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp @@ -581,7 +581,8 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) { params.push_back(desc); } - createSignature(params); + device::Kernel::parameters_t hiddenParams; + createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0); } #endif // defined(WITH_COMPILER_LIB) @@ -660,8 +661,8 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) { params.push_back(desc); } - - createSignature(params); + device::Kernel::parameters_t hiddenParams; + createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0); } #endif // defined(WITH_LIGHTNING_COMPILER) diff --git a/projects/clr/rocclr/runtime/platform/kernel.cpp b/projects/clr/rocclr/runtime/platform/kernel.cpp index 38fc9cabe0..a4616033aa 100644 --- a/projects/clr/rocclr/runtime/platform/kernel.cpp +++ b/projects/clr/rocclr/runtime/platform/kernel.cpp @@ -243,13 +243,17 @@ void KernelParameters::release(address mem, const amd::Device& device) const { } KernelSignature::KernelSignature(const std::vector& params, - const std::string& attrib) + const std::string& attrib, + const std::vector& hiddenParams, + uint32_t version) : params_(params) + , hiddenParams_(hiddenParams) , attributes_(attrib) , paramsSize_(0) , numMemories_(0) , numSamplers_(0) - , numQueues_(0) { + , numQueues_(0) + , version_(version) { size_t maxOffset = 0; size_t last = 0; // Find the last entry @@ -283,7 +287,15 @@ KernelSignature::KernelSignature(const std::vector& p if (lastSize == 0 /* local mem */) { lastSize = sizeof(cl_mem); } - paramsSize_ = params[last].offset_ + alignUp(lastSize, sizeof(intptr_t)); + // Note: It's a special case. HW ABI expects 64 bit for SRD, regardless of the binary. + // Force the size to 64 bit for those cases. + if ((params[last].info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) || + (params[last].info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) || + (params[last].info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) { + lastSize = alignUp(lastSize, sizeof(uint64_t)); + } + paramsSize_ = params[last].offset_ + lastSize; + paramsSize_ = alignUp(paramsSize_, sizeof(intptr_t)); } } } // namespace amd diff --git a/projects/clr/rocclr/runtime/platform/kernel.hpp b/projects/clr/rocclr/runtime/platform/kernel.hpp index 838c5d7198..f506241ee0 100644 --- a/projects/clr/rocclr/runtime/platform/kernel.hpp +++ b/projects/clr/rocclr/runtime/platform/kernel.hpp @@ -36,18 +36,30 @@ class Program; class KernelSignature : public HeapObject { private: std::vector params_; + std::vector hiddenParams_; std::string attributes_; //!< The kernel attributes uint32_t paramsSize_; uint32_t numMemories_; uint32_t numSamplers_; uint32_t numQueues_; + uint32_t version_; public: + enum { + ABIVersion_0 = 0, //! ABI constructed based on the OCL semantics + ABIVersion_1 = 1 //! ABI constructed based on the HW ABI returned from the compiler + }; + //! Default constructor - KernelSignature() : paramsSize_(0), numMemories_(0), numSamplers_(0), numQueues_(0) {} + KernelSignature(): + paramsSize_(0), numMemories_(0), numSamplers_(0), + numQueues_(0), version_(ABIVersion_0) {} //! Construct a new signature. - KernelSignature(const std::vector& params, const std::string& attrib); + KernelSignature(const std::vector& params, + const std::string& attrib, + const std::vector& hiddenParams, + uint32_t version); //! Return the number of parameters size_t numParameters() const { return params_.size(); } @@ -72,8 +84,17 @@ class KernelSignature : public HeapObject { //! Returns the number of queue objects. uint32_t numQueues() const { return numQueues_; } + //! Returns the signature version + uint32_t version() const { return version_; } + //! Return the kernel attributes const std::string& attributes() const { return attributes_; } + + const std::vector& hiddenParameters() const + { return hiddenParams_; } + + const std::vector& parameters() const + { return params_; } }; // @todo: look into a copy-on-write model instead of copy-on-read. diff --git a/projects/clr/rocclr/runtime/platform/program.cpp b/projects/clr/rocclr/runtime/platform/program.cpp index 055c351b07..9b68004437 100644 --- a/projects/clr/rocclr/runtime/platform/program.cpp +++ b/projects/clr/rocclr/runtime/platform/program.cpp @@ -604,8 +604,8 @@ bool Program::ParseAllOptions(const std::string& options, option::Options& parse } bool Symbol::setDeviceKernel(const Device& device, const device::Kernel* func) { - // FIXME_lmoriche: check that the signatures are compatible - if (deviceKernels_.size() == 0) { + if (deviceKernels_.size() == 0 || + (func->signature().version() > KernelSignature::ABIVersion_0)) { signature_ = func->signature(); } deviceKernels_[&device] = func;