From 9cd2db67f6fcff17111a3af11da54f2c56a0d2be Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Fri, 3 Aug 2018 16:05:12 -0400
Subject: [PATCH] P4 to Git Change 1589476 by axie@axie-rocm-opencl on
 2018/08/03 15:54:24

	SWDEV-79445 - OCL generic changes and code clean-up
	- Optimize setup of kernel arguments.
	- Add HW ABI support in the abstraction layer
	- Remove the argument parsing loop from the kernel launch. Memory processing will be responsible for dependency tracking and patching of arguments.

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/15400/

	Tests:
	1. ./run_conformance.py ./opencl_conformance_tests_reallyquick.csv CL_DEVICE_TYPE_GPU for OpenCL 1.2: OpenCL-GL sharing failed. This is not a regression.
	2. ./ocltst -m oclruntime.so -A oclruntime.exclude
	3. ./run_conformance.py opencl_conformance_tests_lightning.csv CL_DEVICE_TYPE_GPU : PASS
	4. teamcity test: http://ocltc.amd.com:8111/viewModification.html?modId=104598&personal=true&buildTypeId=&tab=vcsModificationBuilds&show_all_builds=true

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#34 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#60 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#17 edit
---
 rocclr/runtime/device/rocm/rockernel.cpp   | 134 +++++-
 rocclr/runtime/device/rocm/rockernel.hpp   |   3 +
 rocclr/runtime/device/rocm/rocsettings.cpp |   1 -
 rocclr/runtime/device/rocm/rocsettings.hpp |   1 -
 rocclr/runtime/device/rocm/rocvirtual.cpp  | 459 +++++++++------------
 rocclr/runtime/device/rocm/rocvirtual.hpp  |  33 +-
 6 files changed, 356 insertions(+), 275 deletions(-)

diff --git a/rocclr/runtime/device/rocm/rockernel.cpp b/rocclr/runtime/device/rocm/rockernel.cpp
index 68e6d96944..47268bc612 100644
--- a/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/rocclr/runtime/device/rocm/rockernel.cpp
@@ -231,6 +231,37 @@ static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo)
   return ROC_ADDRESS_ERROR;
 }
 
+inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
+  switch (arg->type_){
+    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
+      return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
+    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
+      return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
+    case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
+      return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
+    case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER:
+      return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
+    case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
+      return amd::KernelParameterDescriptor::HiddenDefaultQueue;
+    case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
+      return amd::KernelParameterDescriptor::HiddenCompletionAction;
+    case ROC_ARGTYPE_POINTER:
+        return amd::KernelParameterDescriptor::MemoryObject;
+    case ROC_ARGTYPE_IMAGE:
+        return amd::KernelParameterDescriptor::ImageObject;
+    case ROC_ARGTYPE_REFERENCE:
+        return amd::KernelParameterDescriptor::ReferenceObject;
+    case ROC_ARGTYPE_VALUE:
+        return amd::KernelParameterDescriptor::ValueObject;
+    case ROC_ARGTYPE_SAMPLER:
+        return amd::KernelParameterDescriptor::SamplerObject;
+    case ROC_ARGTYPE_QUEUE:
+        return amd::KernelParameterDescriptor::QueueObject;
+    default:
+      return amd::KernelParameterDescriptor::HiddenNone;
+  }
+}
+
 #if defined(WITH_LIGHTNING_COMPILER)
 static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) {
   aclArgDataType dataType;
@@ -514,6 +545,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argI
 #if defined(WITH_COMPILER_LIB)
 void HSAILKernel::initArguments(const aclArgData* aclArg) {
   device::Kernel::parameters_t params;
+  device::Kernel::parameters_t hiddenParams;
+  size_t offsetStruct = KernargSegmentByteSize();
 
   // Iterate through the arguments and insert into parameterList
   for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) {
@@ -539,17 +572,27 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
     arg->index_ = isHidden ? uint(-1) : params.size();
     hsailArgList_.push_back(arg);
 
+    amd::KernelParameterDescriptor desc;
+
+    // Allocate the hidden arguments, but abstraction layer will skip them
     if (isHidden) {
+      offset = amd::alignUp(offset, arg->alignment_);
+      desc.offset_ = offset;
+      desc.size_ = arg->size_;
+      offset += arg->size_;
+      desc.info_.oclObject_ = GetOclArgumentType(arg);
+      hiddenParams.push_back(desc);
       continue;
     }
 
-    amd::KernelParameterDescriptor desc;
     desc.name_ = arg->name_.c_str();
     desc.type_ = GetOclType(arg);
     desc.addressQualifier_ = GetOclAddrQual(arg);
     desc.accessQualifier_ = GetOclAccessQual(arg);
     desc.typeQualifier_ = GetOclTypeQual(aclArg);
     desc.typeName_ = arg->typeName_.c_str();
+    desc.info_.oclObject_ = GetOclArgumentType(arg);
+    desc.info_.arrayIndex_ = arg->pointeeAlignment_;
 
     // set image related flags
     if (arg->type_ == ROC_ARGTYPE_IMAGE) {
@@ -566,19 +609,48 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
     // and CPU sends the parameters as they are allocated in memory
     size_t size = desc.size_;
 
-    offset = amd::alignUp(offset, std::min(size, size_t(16)));
-    desc.offset_ = offset;
-    offset += amd::alignUp(size, sizeof(uint32_t));
+    // Check if HSAIL expects data by reference and allocate it behind
+    if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
+      desc.offset_ = offsetStruct;
+      // Align the offset reference
+      offset = amd::alignUp(offset, sizeof(size_t));
+      patchReferences_.insert({desc.offset_, offset});
+      offsetStruct += size;
+      // Adjust the offset of arguments
+      offset += sizeof(size_t);
+    }
+    else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+        (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+        (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+      // These objects have forced data size to uint64_t
+      offset = amd::alignUp(offset, sizeof(uint64_t));
+      desc.offset_ = offset;
+      offset += sizeof(uint64_t);
+    } else {
+      offset = amd::alignUp(offset, arg->alignment_);
+      desc.offset_ = offset;
+      offset += size;
+    }
+
+    // Update read only flag
+    desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
 
     params.push_back(desc);
   }
-  createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
+
+  // Save the number of OCL arguments
+  uint32_t numParams = params.size();
+  // Append the hidden arguments to the OCL arguments
+  params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
+  createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
 }
 #endif // defined(WITH_COMPILER_LIB)
 
 #if defined(WITH_LIGHTNING_COMPILER)
 void LightningKernel::initArguments(const KernelMD& kernelMD) {
   device::Kernel::parameters_t params;
+  device::Kernel::parameters_t hiddenParams;
+  size_t offsetStruct = KernargSegmentByteSize();
 
   size_t offset = 0;
 
@@ -607,19 +679,27 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
     arg->index_ = isHidden ? uint(-1) : params.size();
     hsailArgList_.push_back(arg);
 
-    if (isHidden) {
-      continue;
-    }
-
     // Initialize Device kernel parameters
     amd::KernelParameterDescriptor desc;
 
+    if (isHidden) {
+      offset = amd::alignUp(offset, arg->alignment_);
+      desc.offset_ = offset;
+      desc.size_ = arg->size_;
+      offset += arg->size_;
+      desc.info_.oclObject_ = GetOclArgumentType(arg);
+      hiddenParams.push_back(desc);
+      continue;
+    }
+
     desc.name_ = lcArg.mName.c_str();
     desc.type_ = GetOclType(arg);
     desc.addressQualifier_ = GetOclAddrQual(arg);
     desc.accessQualifier_ = GetOclAccessQual(arg);
     desc.typeQualifier_ = GetOclTypeQual(lcArg);
     desc.typeName_ = lcArg.mTypeName.c_str();
+    desc.info_.oclObject_ = GetOclArgumentType(arg);
+    desc.info_.arrayIndex_ = arg->pointeeAlignment_;
 
     // set image related flags
     if (arg->type_ == ROC_ARGTYPE_IMAGE) {
@@ -629,6 +709,7 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
         flags_.imageWrite_ = true;
       }
     }
+
     desc.size_ = arg->size_;
 
     // Make offset alignment to match CPU metadata, since
@@ -636,13 +717,40 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
     // and CPU sends the parameters as they are allocated in memory
     size_t size = desc.size_;
 
-    offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
-    desc.offset_ = offset;
-    offset += amd::alignUp(size, sizeof(uint32_t));
+    // Check if HSAIL expects data by reference and allocate it behind
+    if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
+      desc.offset_ = offsetStruct;
+      // Align the offset reference
+      offset = amd::alignUp(offset, sizeof(size_t));
+      patchReferences_.insert({desc.offset_, offset});
+      offsetStruct += size;
+      // Adjust the offset of arguments
+      offset += sizeof(size_t);
+    }
+    else if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+        (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+        (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+      // These objects have forced data size to uint64_t
+      offset = amd::alignUp(offset, sizeof(uint64_t));
+      desc.offset_ = offset;
+      offset += sizeof(uint64_t);
+    } else {
+      offset = amd::alignUp(offset, arg->alignment_);
+      desc.offset_ = offset;
+      offset += size;
+    }
+
+    // Update read only flag
+    desc.info_.readOnly_ = (arg->access_ == ROC_ACCESS_TYPE_RO) ? true : false;
 
     params.push_back(desc);
   }
-  createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
+
+  // Save the number of OCL arguments
+  uint32_t numParams = params.size();
+  // Append the hidden arguments to the OCL arguments
+  params.insert(params.end(), hiddenParams.begin(), hiddenParams.end());
+  createSignature(params, numParams, amd::KernelSignature::ABIVersion_1);
 }
 #endif  // defined(WITH_LIGHTNING_COMPILER)
 
diff --git a/rocclr/runtime/device/rocm/rockernel.hpp b/rocclr/runtime/device/rocm/rockernel.hpp
index f0b8690e71..0c1c0f7e18 100644
--- a/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/rocclr/runtime/device/rocm/rockernel.hpp
@@ -140,6 +140,8 @@ class Kernel : public device::Kernel {
   //! Return TRUE if kernel wirtes images
   bool imageWrite() const { return (flags_.imageWrite_) ? true : false; }
 
+  const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
+
  protected:
   union Flags {
     struct {
@@ -162,6 +164,7 @@ class Kernel : public device::Kernel {
   const uint32_t kernargSegmentAlignment_;
   size_t kernelDirectiveOffset_;
   std::vector<PrintfInfo> printf_;
+  std::unordered_map<size_t, size_t> patchReferences_;  //!< Patch table for references
 };
 
 #if defined(WITH_COMPILER_LIB)
diff --git a/rocclr/runtime/device/rocm/rocsettings.cpp b/rocclr/runtime/device/rocm/rocsettings.cpp
index a768801d3b..6c6c7d71da 100644
--- a/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -20,7 +20,6 @@ Settings::Settings() {
   pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION;
 
   enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
-  enableImageHandle_ = true;
 
   maxWorkGroupSize_ = 1024;
   preferredWorkGroupSize_ = 256;
diff --git a/rocclr/runtime/device/rocm/rocsettings.hpp b/rocclr/runtime/device/rocm/rocsettings.hpp
index 1ecd636d2a..d3c601de4c 100644
--- a/rocclr/runtime/device/rocm/rocsettings.hpp
+++ b/rocclr/runtime/device/rocm/rocsettings.hpp
@@ -22,7 +22,6 @@ class Settings : public device::Settings {
       uint doublePrecision_ : 1;        //!< Enables double precision support
       uint pollCompletion_ : 1;         //!< Enables polling in HSA
       uint enableLocalMemory_ : 1;      //!< Enable GPUVM memory
-      uint enableImageHandle_ : 1;      //!< Use HSAIL image/sampler pointer
       uint enableNCMode_ : 1;           //!< Enable Non Coherent mode for system memory
       uint enablePartialDispatch_ : 1;  //!< Enable support for Partial Dispatch
       uint imageDMA_ : 1;               //!< Enable direct image DMA transfers
diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp
index 41c95d9995..9d9cdecd34 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -185,8 +185,37 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
   }
 }
 
-bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params) {
-  const Kernel& hsaKernel = static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev())));
+static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor,
+                                 const amd::Sampler& sampler) {
+  samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
+      ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
+      : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
+  samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
+      ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
+      : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
+  switch (sampler.addressingMode()) {
+    case CL_ADDRESS_CLAMP_TO_EDGE:
+      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
+      break;
+    case CL_ADDRESS_REPEAT:
+      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
+      break;
+    case CL_ADDRESS_CLAMP:
+      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
+      break;
+    case CL_ADDRESS_MIRRORED_REPEAT:
+      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
+      break;
+    case CL_ADDRESS_NONE:
+      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
+      break;
+    default:
+      return;
+  }
+}
+
+bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params, size_t& ldsAddress) {
+  Kernel& hsaKernel = const_cast<Kernel&>(static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev()))));
   const amd::KernelSignature& signature = kernel.signature();
   const amd::KernelParameters& kernelParams = kernel.parameters();
 
@@ -256,38 +285,141 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
   // Check all parameters for the current kernel
   for (size_t i = 0; i < signature.numParameters(); ++i) {
     const amd::KernelParameterDescriptor& desc = signature.at(i);
-    const Kernel::Argument* arg = hsaKernel.hsailArgAt(i);
     Memory* gpuMem = nullptr;
-    bool readOnly = false;
     amd::Memory* mem = nullptr;
 
     // Find if current argument is a buffer
-    if ((desc.type_ == T_POINTER) && (arg->addrQual_ != ROC_ADDRESS_LOCAL)) {
-      uint32_t index = desc.info_.arrayIndex_;
-      mem = memories[index];
-      if (mem != nullptr) {
-        gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));
-        // Don't sync for internal objects,
-        // since they are not shared between devices
-        if (gpuMem->owner()->getVirtualDevice() == nullptr) {
-          // Synchronize data with other memory instances if necessary
-          gpuMem->syncCacheFromHost(*this);
+    if (desc.type_ == T_POINTER) {
+      if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
+        // Align the LDS on the alignment requirement of type pointed to
+        ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
+        if (desc.size_ == 8) {
+          // Save the original LDS size
+          uint64_t ldsSize = *reinterpret_cast<const uint64_t*>(params + desc.offset_);
+          // Patch the LDS address in the original arguments with an LDS address(offset)
+          WriteAqlArgAt(const_cast<address>(params), &ldsAddress, desc.size_, desc.offset_);
+          // Add the original size
+          ldsAddress += ldsSize;
+        } else {
+          // Save the original LDS size
+          uint32_t ldsSize = *reinterpret_cast<const uint32_t*>(params + desc.offset_);
+          // Patch the LDS address in the original arguments with an LDS address(offset)
+          uint32_t ldsAddr = ldsAddress;
+          WriteAqlArgAt(const_cast<address>(params), &ldsAddr, desc.size_, desc.offset_);
+          // Add the original size
+          ldsAddress += ldsSize;
         }
       }
-      //! This condition is for SVM fine-grain
-      if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
-        // Sync AQL packets
-        setAqlHeader(kDispatchPacketHeader);
-        // Clear memory dependency state
-        const static bool All = true;
-        memoryDependency().clear(!All);
-        continue;
-      } else if (gpuMem != nullptr) {
-        readOnly |= (arg->access_ == ROC_ACCESS_TYPE_RO);
-        // Validate memory for a dependency in the queue
-        memoryDependency().validate(*this, gpuMem, readOnly);
+      else {
+        uint32_t index = desc.info_.arrayIndex_;
+        mem = memories[index];
+        if (mem == nullptr) {
+          //! This condition is for SVM fine-grain
+          if (dev().isFineGrainedSystem(true)) {
+            // Sync AQL packets
+            setAqlHeader(kDispatchPacketHeader);
+            // Clear memory dependency state
+            const static bool All = true;
+            memoryDependency().clear(!All);
+          }
+        }
+        else {
+          gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));
+          // Don't sync for internal objects,
+          // since they are not shared between devices
+          if (gpuMem->owner()->getVirtualDevice() == nullptr) {
+            // Synchronize data with other memory instances if necessary
+            gpuMem->syncCacheFromHost(*this);
+          }
+
+          // Validate memory for a dependency in the queue
+          memoryDependency().validate(*this, gpuMem, (desc.info_.readOnly_ == 1));
+
+          assert((desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_GLOBAL ||
+                  desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) &&
+                 "Unsupported address qualifier");
+
+          const bool readOnly =
+#if defined(WITH_LIGHTNING_COMPILER)
+          desc.typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST ||
+#endif // defined(WITH_LIGHTNING_COMPILER)
+            (mem->getMemFlags() & CL_MEM_READ_ONLY) != 0;
+
+          if (!readOnly) {
+            mem->signalWrite(&dev());
+          }
+
+          if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+            Image* image = static_cast<Image*>(mem->getDeviceMemory(dev()));
+
+            const uint64_t image_srd = image->getHsaImageObject().handle;
+            assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
+            WriteAqlArgAt(const_cast<address>(params), &image_srd, sizeof(image_srd), desc.offset_);
+          }
+        }
       }
     }
+    else if (desc.type_ == T_QUEUE) {
+      uint32_t index = desc.info_.arrayIndex_;
+      const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
+        params + kernelParams.queueObjOffset())[index];
+
+      if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
+         return false;
+      }
+      hsaKernel.setDynamicParallelFlag(true);
+      uint64_t vqVA = getVQVirtualAddress();
+      WriteAqlArgAt(const_cast<address>(params), &vqVA, sizeof(vqVA), desc.offset_);
+    }
+    else if (desc.type_ == T_VOID) {
+      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
+        const_address srcArgPtr = params + desc.offset_;
+        void* mem = allocKernArg(desc.size_, 128);
+        if (mem == nullptr) {
+          LogError("Out of memory");
+          return false;
+        }
+        memcpy(mem, srcArgPtr, desc.size_);
+        const auto it = hsaKernel.patch().find(desc.offset_);
+        WriteAqlArgAt(const_cast<address>(params), &mem, sizeof(void*), it->second);
+      }
+    }
+    else if (desc.type_ == T_SAMPLER) {
+      uint32_t index = desc.info_.arrayIndex_;
+      const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(params +
+        kernelParams.samplerObjOffset())[index];
+
+      hsa_ext_sampler_descriptor_t samplerDescriptor;
+      fillSampleDescriptor(samplerDescriptor, *sampler);
+
+      hsa_ext_sampler_t hsa_sampler;
+      hsa_status_t status =
+        hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
+
+      if (status != HSA_STATUS_SUCCESS) {
+        // Wait on a kernel if one is outstanding
+        releaseGpuMemoryFence();
+        // Release the sampler handles allocated
+        // on one or more kernel submissions
+        for (const auto& it: samplerList_) {
+          if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) {
+              LogWarning("Error destroying device sampler object!");
+          }
+        }
+
+        samplerList_.clear();
+        status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
+        if (status != HSA_STATUS_SUCCESS) {
+          LogError("Error creating device sampler object!");
+          return false;
+        }
+      }
+
+      uint64_t sampler_srd = hsa_sampler.handle;
+      WriteAqlArgAt(const_cast<address>(params), &sampler_srd, sizeof(sampler_srd), desc.offset_);
+      samplerList_.push_back(hsa_sampler);
+      // TODO: destroy sampler.
+    }
   }
 
   if (hsaKernel.program()->hasGlobalStores()) {
@@ -1438,26 +1570,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
   profilingEnd(vcmd);
 }
 
-/*! \brief Writes to the buffer and increments the write pointer to the
- *         buffer. Also, ensures that the argument is written to an
- *         aligned memory as specified. Return the new write pointer.
- *
- * @param dst The write pointer to the buffer
- * @param src The source pointer
- * @param size The size in bytes to copy
- * @param alignment The alignment to follow while writing to the buffer
- */
-static inline address addArg(address dst, const void* src, size_t size, uint32_t alignment) {
-  dst = amd::alignUp(dst, alignment);
-  ::memcpy(dst, src, size);
-  return dst + size;
-}
-
-static inline address addArg(address dst, const void* src, size_t size) {
-  assert(size < UINT32_MAX);
-  return addArg(dst, src, size, size);
-}
-
 // Over rides the workgroup size fields in the packet with runtime/compiler set sizes
 void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
                                  amd::NDRangeContainer sizes, device::Kernel* devKernel,
@@ -1584,35 +1696,6 @@ void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
   }
 }
 
-static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor,
-                                 const amd::Sampler& sampler) {
-  samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
-      ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
-      : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
-  samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
-      ? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
-      : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
-  switch (sampler.addressingMode()) {
-    case CL_ADDRESS_CLAMP_TO_EDGE:
-      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
-      break;
-    case CL_ADDRESS_REPEAT:
-      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
-      break;
-    case CL_ADDRESS_CLAMP:
-      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
-      break;
-    case CL_ADDRESS_MIRRORED_REPEAT:
-      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
-      break;
-    case CL_ADDRESS_NONE:
-      samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
-      break;
-    default:
-      return;
-  }
-}
-
 bool VirtualGPU::createSchedulerParam()
 {
   if (nullptr != schedulerParam_) {
@@ -1797,12 +1880,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
                                       const_address parameters, void* eventHandle) {
   device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
   Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
-
-  const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
-  size_t ldsUsage = compilerLdsUsage;
+  size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
 
   // Check memory dependency and SVM objects
-  if (!processMemObjects(kernel, parameters)) {
+  if (!processMemObjects(kernel, parameters, ldsUsage)) {
     LogError("Wrong memory objects!");
     return false;
   }
@@ -1868,58 +1949,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
       return false;
     }
 
-    address argPtr = argBuffer;
-    for (auto arg : gpuKernel.hsailArgs()) {
-      const_address srcArgPtr = nullptr;
-      if (arg->index_ != uint(-1)) {
-        srcArgPtr = parameters + signature.at(arg->index_).offset_;
-      }
-
-      // Handle the hidden arguments first, as they do not have a
-      // matching parameter in the OCL signature (not a valid arg->index_)
-      switch (arg->type_) {
-        case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
-          size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0;
-          assert(arg->size_ == sizeof(offset_x) && "check the sizes");
-          argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
+    // Check if runtime has to setup hidden arguments
+    for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
+      const auto it = signature.at(i);
+      size_t offset;
+      switch (it.info_.oclObject_) {
+        case amd::KernelParameterDescriptor::HiddenNone:
+          break;
+        case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: {
+          offset = newOffset[0];
+          assert(it.size_ == sizeof(offset) && "check the sizes");
+          WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
           break;
         }
-        case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
-          size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0;
-          assert(arg->size_ == sizeof(offset_y) && "check the sizes");
-          argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
+        case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: {
+          if (sizes.dimensions() >= 2) {
+            offset = newOffset[1];
+            assert(it.size_ == sizeof(offset) && "check the sizes");
+            WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
+          }
           break;
         }
-        case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
-          size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0;
-          assert(arg->size_ == sizeof(offset_z) && "check the sizes");
-          argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
+        case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: {
+          if (sizes.dimensions() >= 3) {
+            offset = newOffset[2];
+            assert(it.size_ == sizeof(offset) && "check the sizes");
+            WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
+          }
           break;
         }
-        case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
+        case amd::KernelParameterDescriptor::HiddenPrintfBuffer: {
           address bufferPtr = printfDbg()->dbgBuffer();
-          assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
-          argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
+          if (printfEnabled &&
+            // and printf buffer was allocated
+            (bufferPtr != nullptr)) {
+            assert(it.size_ == sizeof(bufferPtr) && "check the sizes");
+            WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
+          }
           break;
         }
-        case ROC_ARGTYPE_QUEUE: {
-          uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
-          const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(parameters +
-            kernelParams.samplerObjOffset())[index];
-          if (queue == nullptr) {
-            return false;
-          }
-
-          if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
-            return false;
-          }
-          gpuKernel.setDynamicParallelFlag(true);
-          uint64_t vqVA = getVQVirtualAddress();
-          argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_);
-          break;
-        }
-        case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: {
-
+        case amd::KernelParameterDescriptor::HiddenDefaultQueue: {
           amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
 
           if (!createVirtualQueue(defQueue->size()) || !createSchedulerParam()) {
@@ -1927,156 +1996,28 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
           }
           gpuKernel.setDynamicParallelFlag(true);
           uint64_t vqVA = getVQVirtualAddress();
-          argPtr = addArg(argPtr, &vqVA, arg->size_, arg->alignment_);
+          WriteAqlArgAt(const_cast<address>(parameters), &vqVA, it.size_, it.offset_);
           break;
         }
-        case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: {
-
+        case amd::KernelParameterDescriptor::HiddenCompletionAction: {
           Memory* schedulerMem = dev().getRocMemory(schedulerParam_);
           AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(reinterpret_cast<uint64_t>(schedulerParam_->getHostMem()) + sizeof(SchedulerParam));
           memset(wrap, 0, sizeof(AmdAqlWrap));
           wrap->state = AQL_WRAP_DONE;
 
           uint64_t spVA = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory()) + sizeof(SchedulerParam);
-          argPtr = addArg(argPtr, &spVA, arg->size_, arg->alignment_);
+          WriteAqlArgAt(const_cast<address>(parameters), &spVA, it.size_, it.offset_);
           break;
         }
-        case ROC_ARGTYPE_HIDDEN_NONE: {
-          void* zero = 0;
-          assert(arg->size_ <= sizeof(zero) && "check the sizes");
-          argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
-          break;
-        }
-        case ROC_ARGTYPE_POINTER: {
-          if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
-            // Align the LDS on the alignment requirement of type pointed to
-            ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
-            argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
-            if (sizeof(uint64_t) == arg->size_) {
-              ldsUsage += *reinterpret_cast<const uint64_t*>(srcArgPtr);
-            } else {
-              ldsUsage += *reinterpret_cast<const uint32_t*>(srcArgPtr);
-            }
-            break;
-          }
-          assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL || arg->addrQual_ == ROC_ADDRESS_CONSTANT) &&
-                 "Unsupported address qualifier");
-          argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
-          uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
-          amd::Memory* mem = memories[index];
-          if (mem == nullptr) {
-            break;
-          }
-
-          const bool readOnly =
-#if defined(WITH_LIGHTNING_COMPILER)
-              signature.at(arg->index_).typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST ||
-#endif // defined(WITH_LIGHTNING_COMPILER)
-              (mem->getMemFlags() & CL_MEM_READ_ONLY) != 0;
-
-          if (!readOnly) {
-            mem->signalWrite(&dev());
-          }
-          break;
-        }
-        case ROC_ARGTYPE_REFERENCE: {
-          void* mem = allocKernArg(arg->size_, arg->alignment_);
-          if (mem == nullptr) {
-            LogError("Out of memory");
-            return false;
-          }
-          memcpy(mem, srcArgPtr, arg->size_);
-          argPtr = addArg(argPtr, &mem, sizeof(void*));
-          break;
-        }
-        case ROC_ARGTYPE_VALUE:
-          argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
-          break;
-        case ROC_ARGTYPE_IMAGE: {
-          uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
-          amd::Memory* mem = memories[index];
-          Image* image = static_cast<Image*>(mem->getDeviceMemory(dev()));
-          if (image == nullptr) {
-            LogError("Kernel image argument is not an image object");
-            return false;
-          }
-
-          if (dev().settings().enableImageHandle_) {
-            const uint64_t image_srd = image->getHsaImageObject().handle;
-            assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
-            argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
-          } else {
-            // Image arguments are of size 48 bytes and are aligned to 16 bytes
-            argPtr = addArg(argPtr, (void*)image->getHsaImageObject().handle, HSA_IMAGE_OBJECT_SIZE,
-                            HSA_IMAGE_OBJECT_ALIGNMENT);
-          }
-
-          const bool readOnly =
-#if defined(WITH_LIGHTNING_COMPILER)
-              signature.at(arg->index_).accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY ||
-#endif // defined(WITH_LIGHTNING_COMPILER)
-              mem->getMemFlags() & CL_MEM_READ_ONLY;
-
-          if (!readOnly) {
-            mem->signalWrite(&dev());
-          }
-          break;
-        }
-        case ROC_ARGTYPE_SAMPLER: {
-          uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
-          const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
-            kernelParams.samplerObjOffset())[index];
-          if (sampler == nullptr) {
-            LogError("Kernel sampler argument is not an sampler object");
-            return false;
-          }
-
-          hsa_ext_sampler_descriptor_t samplerDescriptor;
-          fillSampleDescriptor(samplerDescriptor, *sampler);
-
-          hsa_ext_sampler_t hsa_sampler;
-          hsa_status_t status =
-              hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
-          if (status != HSA_STATUS_SUCCESS) {
-            // Wait on a kernel if one is outstanding
-            releaseGpuMemoryFence();
-            // Release the sampler handles allocated for the various
-            // on one or more kernel submissions
-            for (const auto& it: samplerList_) {
-              if (hsa_ext_sampler_destroy(gpu_device_, it) != HSA_STATUS_SUCCESS) {
-                LogWarning("Error destroying device sampler object!");
-              }
-            }
-            samplerList_.clear();
-
-            status = hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler);
-            if (status != HSA_STATUS_SUCCESS) {
-              LogError("Error creating device sampler object!");
-              return false;
-            }
-          }
-
-          if (dev().settings().enableImageHandle_) {
-            uint64_t sampler_srd = hsa_sampler.handle;
-            argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
-            samplerList_.push_back(hsa_sampler);
-            // TODO: destroy sampler.
-          } else {
-            argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
-
-            memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
-            argPtr += HSA_SAMPLER_OBJECT_SIZE;
-            hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
-          }
-          break;
-        }
-        default:
-          return false;
       }
     }
 
-    // Check there is no arguments' buffer overflow
-    assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
+    // Load all kernel arguments
+    WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0);
+    // Note: In a case of structs the size won't match,
+    // since HSAIL compiler expects a reference...
+    assert(gpuKernel.KernargSegmentByteSize() <= signature.paramsSize() &&
+      "A mismatch of sizes of arguments between compiler and runtime!");
 
     // Check for group memory overflow
     //! @todo Check should be in HSA - here we should have at most an assert
diff --git a/rocclr/runtime/device/rocm/rocvirtual.hpp b/rocclr/runtime/device/rocm/rocvirtual.hpp
index 520cc9f515..40758f8fd5 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -217,7 +217,8 @@ class VirtualGPU : public device::VirtualDevice {
 
   //! Detects memory dependency for HSAIL kernels and uses appropriate AQL header
   bool processMemObjects(const amd::Kernel& kernel,  //!< AMD kernel object for execution
-                         const_address params        //!< Pointer to the param's store
+                         const_address params,       //!< Pointer to the param's store
+			 size_t& ldsAddress          //!< LDS usage; non-const reference, presumably updated by the call (TODO confirm)
                          );
   // Retun the virtual gpu unique index
   uint index() const { return index_; }
@@ -313,4 +314,34 @@ class VirtualGPU : public device::VirtualDevice {
   };
 
 };
+
+template <typename T>  // Generic path: copies `size` bytes from src to dst + offset
+inline void WriteAqlArgAt(
+  unsigned char* dst,   //!< Base address of the destination buffer
+  const T* src,         //!< Address of the value to copy
+  uint size,            //!< Number of bytes to copy
+  size_t offset         //!< Byte offset from dst at which the value is written
+) {
+  memcpy(dst + offset, src, size);
+}
+
+template <>  // 32-bit path: always stores exactly sizeof(uint32_t) bytes at dst + offset
+inline void WriteAqlArgAt(
+  unsigned char* dst,   //!< Base address of the destination buffer
+  const uint32_t* src,  //!< Address of the 32-bit value to store
+  uint size,            //!< Unused: the store width is fixed at sizeof(uint32_t)
+  size_t offset         //!< Byte offset from dst at which the value is written
+) {
+  memcpy(dst + offset, src, sizeof(uint32_t));  // no alignment/aliasing assumption on dst + offset
+}
+
+template <>  // 64-bit path: always stores exactly sizeof(uint64_t) bytes at dst + offset
+inline void WriteAqlArgAt(
+  unsigned char* dst,   //!< Base address of the destination buffer
+  const uint64_t* src,  //!< Address of the 64-bit value to store
+  uint size,            //!< Unused: the store width is fixed at sizeof(uint64_t)
+  size_t offset         //!< Byte offset from dst at which the value is written
+) {
+  memcpy(dst + offset, src, sizeof(uint64_t));  // no alignment/aliasing assumption on dst + offset
+}
 }