From 665aab7ca4e7699a9926dbb2298ec622e17b385a Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Tue, 12 Jun 2018 18:57:20 -0400
Subject: [PATCH] P4 to Git Change 1567428 by gandryey@gera-w8 on 2018/06/12
 18:39:23

	SWDEV-79445 - OCL generic changes and code clean-up
	- Optimize setup of kernel arguments. Stage 2.
	- Add HW ABI support to the abstraction layer
	- Remove the argument-parsing loop from the kernel launch. Memory processing is now responsible for dependency tracking and patching of arguments.

	http://ocltc.amd.com/reviews/r/15122/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#221 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#307 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#325 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#107 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#53 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/kernel.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/kernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/program.cpp#95 edit


[ROCm/clr commit: 1be400ff014502df6628e71661894460d18ae458]
---
 projects/clr/rocclr/runtime/device/device.cpp |   6 +-
 projects/clr/rocclr/runtime/device/device.hpp |  22 +-
 .../rocclr/runtime/device/gpu/gpukernel.cpp   |   9 +-
 .../clr/rocclr/runtime/device/pal/palblit.cpp |  14 +-
 .../rocclr/runtime/device/pal/palkernel.cpp   | 406 +++++++-----------
 .../rocclr/runtime/device/pal/palkernel.hpp   |   5 +-
 .../rocclr/runtime/device/pal/palmemory.hpp   |   2 +
 .../rocclr/runtime/device/pal/palvirtual.cpp  | 197 ++++++---
 .../rocclr/runtime/device/pal/palvirtual.hpp  |  35 +-
 .../rocclr/runtime/device/rocm/rockernel.cpp  |   7 +-
 .../clr/rocclr/runtime/platform/kernel.cpp    |  18 +-
 .../clr/rocclr/runtime/platform/kernel.hpp    |  25 +-
 .../clr/rocclr/runtime/platform/program.cpp   |   4 +-
 13 files changed, 430 insertions(+), 320 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp
index 0be2916483..718afc078a 100644
--- a/projects/clr/rocclr/runtime/device/device.cpp
+++ b/projects/clr/rocclr/runtime/device/device.cpp
@@ -600,7 +600,9 @@ Settings::Settings() {
                          //!< concurrent Virtual GPUs for default
 }
 
-bool Kernel::createSignature(const parameters_t& params) {
+bool Kernel::createSignature(
+  const parameters_t& params, const parameters_t& hiddenParams,
+  uint32_t version) {
   std::stringstream attribs;
   if (workGroupInfo_.compileSize_[0] != 0) {
     attribs << "reqd_work_group_size(";
@@ -632,7 +634,7 @@ bool Kernel::createSignature(const parameters_t& params) {
   // Destroy old signature if it was allocated before
   // (offline devices path)
   delete signature_;
-  signature_ = new amd::KernelSignature(params, attribs.str());
+  signature_ = new amd::KernelSignature(params, attribs.str(), hiddenParams, version);
   if (NULL != signature_) {
     return true;
   }
diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index 1b3d33f25a..4c32821207 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -852,7 +852,9 @@ class Kernel : public amd::HeapObject {
   const std::string& name() const { return name_; }
 
   //! Initializes the kernel parameters for the abstraction layer
-  bool createSignature(const parameters_t& params);
+  bool createSignature(
+    const parameters_t& params, const parameters_t& hiddenParams,
+    uint32_t version);
 
   //! Returns TRUE if it's a HSA kernel
   bool hsa() const { return hsa_; }
@@ -1624,6 +1626,22 @@ class Device : public RuntimeObject {
 };
 
 struct KernelParameterDescriptor {
+  enum {
+    Value = 0,
+    HiddenNone = 1,
+    HiddenGlobalOffsetX = 2,
+    HiddenGlobalOffsetY = 3,
+    HiddenGlobalOffsetZ = 4,
+    HiddenPrintfBuffer = 5,
+    HiddenDefaultQueue = 6,
+    HiddenCompletionAction = 7,
+    MemoryObject = 8,
+    ReferenceObject = 9,
+    ValueObject = 10,
+    ImageObject = 11,
+    SamplerObject = 12,
+    QueueObject = 13
+  };
   const char* name_;       //!< The parameter's name in the source
   clk_value_type_t type_;  //!< The parameter's type
   size_t offset_;          //!< Its offset in the parameter's stack
@@ -1642,7 +1660,7 @@ struct KernelParameterDescriptor {
       uint32_t rawPointer_ : 1;   //!< Arguments have a raw GPU VA
       uint32_t defined_    : 1;   //!< The argument was defined by the app
       uint32_t reserved_   : 1;   //!< reserved
-      uint32_t arrayIndex_ : 28;  //!< Index in the objects array
+      uint32_t arrayIndex_ : 24;  //!< Index in the objects array or LDS alignment
     };
     uint32_t allValues_;
     InfoData() : allValues_(0) {}
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index e49fe8b63b..09c911022e 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -752,7 +752,8 @@ bool NullKernel::create(const std::string& code, const std::string& metadata,
   workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
 
   device::Kernel::parameters_t params;
-  if (!createSignature(params)) {
+  device::Kernel::parameters_t hiddenParams;
+  if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) {
     return false;
   }
 
@@ -1337,7 +1338,8 @@ bool Kernel::initParameters() {
     workGroupInfo_.localMemSize_ = hwLocalSize_;
   }
 
-  if (!createSignature(params)) {
+  device::Kernel::parameters_t hiddenParams;
+  if (!createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0)) {
     return false;
   }
 
@@ -3017,7 +3019,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
     }
   }
 
-  createSignature(params);
+  device::Kernel::parameters_t hiddenParams;
+  createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
 }
 
 void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index b8845ac7f6..f64af9d2cf 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -943,24 +943,30 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
 
   uint32_t uint32_value = 0;
   uint64_t uint64_value = 0;
+  size_t argSize = desc.size_;
 
   if (desc.type_ == T_POINTER && desc.size_ != 0) {
     if ((value == NULL) || (static_cast<const cl_mem*>(value) == NULL)) {
-      LP64_SWITCH(uint32_value, uint64_value) = 0;
       reinterpret_cast<Memory**>(kernel->parameters().values() +
         kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
     } else {
       // convert cl_mem to amd::Memory*, return false if invalid.
       LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>((
-        *static_cast<Memory* const*>(value))->vmAddress());
+        *static_cast<Memory* const*>(value))->virtualAddress());
       reinterpret_cast<Memory**>(kernel->parameters().values() +
         kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
         *static_cast<Memory* const*>(value);
+      // Note: Special case for image SRD, which is 64 bit always
+      if (LP64_SWITCH(true, false) &&
+          (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) {
+        uint64_value = uint32_value;
+        argSize = sizeof(uint64_t);
+      }
     }
   } else if (desc.type_ == T_SAMPLER) {
     assert(false && "No sampler support in blit manager! Use internal samplers!");
   } else
-    switch (desc.size_) {
+    switch (argSize) {
       case 1:
         uint32_value = *static_cast<const uint8_t*>(value);
         break;
@@ -977,7 +983,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
         break;
     }
 
-  switch (desc.size_) {
+  switch (argSize) {
     case 0 /*local mem*/:
       *static_cast<size_t*>(param) = size;
       break;
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
index c2a2be0c01..f181218413 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -228,6 +228,37 @@ inline static int GetHSAILArgSize(const aclArgData* argInfo) {
   }
 }
 
+inline static uint32_t GetOclArgumentType(const HSAILKernel::Argument* arg) {
+  switch (arg->type_){
+    case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
+      return amd::KernelParameterDescriptor::HiddenGlobalOffsetX;
+    case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
+      return amd::KernelParameterDescriptor::HiddenGlobalOffsetY;
+    case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
+      return amd::KernelParameterDescriptor::HiddenGlobalOffsetZ;
+    case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER:
+      return amd::KernelParameterDescriptor::HiddenPrintfBuffer;
+    case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
+      return amd::KernelParameterDescriptor::HiddenDefaultQueue;
+    case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
+      return amd::KernelParameterDescriptor::HiddenCompletionAction;
+    case HSAIL_ARGTYPE_POINTER:
+        return amd::KernelParameterDescriptor::MemoryObject;
+    case HSAIL_ARGTYPE_IMAGE:
+        return amd::KernelParameterDescriptor::ImageObject;
+    case HSAIL_ARGTYPE_REFERENCE:
+        return amd::KernelParameterDescriptor::ReferenceObject;
+    case HSAIL_ARGTYPE_VALUE:
+        return amd::KernelParameterDescriptor::ValueObject;
+    case HSAIL_ARGTYPE_SAMPLER:
+        return amd::KernelParameterDescriptor::SamplerObject;
+    case HSAIL_ARGTYPE_QUEUE:
+        return amd::KernelParameterDescriptor::QueueObject;
+    default:
+      return amd::KernelParameterDescriptor::HiddenNone;
+  }
+}
+
 inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) {
   static const clk_value_type_t ClkValueMapType[6][6] = {
       {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
@@ -422,12 +453,22 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
 
   // Iterate through the arguments and insert into parameterList
   device::Kernel::parameters_t params;
+  device::Kernel::parameters_t hiddenParams;
   amd::KernelParameterDescriptor desc;
   size_t offset = 0;
+  size_t offsetStruct = argsBufferSize();
 
   for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
-    // skip the hidden arguments
-    if (arguments_[i]->index_ == uint(-1)) continue;
+    // Allocate the hidden arguments, but abstraction layer will skip them
+    if (arguments_[i]->index_ == uint(-1)) {
+      offset = amd::alignUp(offset, arguments_[i]->alignment_);
+      desc.offset_ = offset;
+      desc.size_ = arguments_[i]->size_;
+      offset += arguments_[i]->size_;
+      desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
+      hiddenParams.push_back(desc);
+      continue;
+    }
 
     desc.name_ = arguments_[i]->name_.c_str();
     desc.type_ = GetOclType(arguments_[i]);
@@ -435,6 +476,8 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
     desc.accessQualifier_ = GetOclAccessQual(arguments_[i]);
     desc.typeQualifier_ = GetOclTypeQual(aclArg);
     desc.typeName_ = arguments_[i]->typeName_.c_str();
+    desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
+    desc.info_.arrayIndex_ = arguments_[i]->pointeeAlignment_;
 
     // Make a check if it is local or global
     if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
@@ -451,9 +494,32 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
       // Local memory for CPU
       size = sizeof(cl_mem);
     }
-    offset = amd::alignUp(offset, std::min(size, size_t(16)));
-    desc.offset_ = offset;
-    offset += amd::alignUp(size, sizeof(uint32_t));
+    // Check if HSAIL expects data by reference and allocate it behind
+    if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) {
+      desc.offset_ = offsetStruct;
+      // Align the offset reference
+      offset = amd::alignUp(offset, sizeof(size_t));
+      patchReferences_.insert({desc.offset_, offset});
+      offsetStruct += size;
+      // Adjust the offset of arguments
+      offset += sizeof(size_t);
+    } else {
+      // These objects have forced data size to uint64_t
+      if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+          (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+          (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+        offset = amd::alignUp(offset, sizeof(uint64_t));
+        desc.offset_ = offset;
+        offset += sizeof(uint64_t);
+      } else {
+        offset = amd::alignUp(offset, arguments_[i]->alignment_);
+        desc.offset_ = offset;
+        offset += size;
+      }
+    }
+    // Update read only flag
+    desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
+
     params.push_back(desc);
 
     if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
@@ -464,7 +530,7 @@ void HSAILKernel::initArgList(const aclArgData* aclArg) {
     }
   }
 
-  createSignature(params);
+  createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1);
 }
 
 void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
@@ -869,247 +935,79 @@ void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkS
   }
 }
 
-template <typename T>
-inline void WriteAqlArg(
-    unsigned char** dst,  //!< The write pointer to the buffer
-    const T* src,         //!< The source pointer
-    uint size,            //!< The size in bytes to copy
-    uint alignment        //!< The alignment to follow while writing to the buffer
-) {
-  *dst = amd::alignUp(*dst, alignment);
-  memcpy(*dst, src, size);
-  *dst += size;
-}
-
-template <>
-inline void WriteAqlArg(
-    unsigned char** dst,  //!< The write pointer to the buffer
-    const uint32_t* src,  //!< The source pointer
-    uint size,            //!< The size in bytes to copy
-    uint alignment        //!< The alignment to follow while writing to the buffer
-) {
-  *dst = amd::alignUp(*dst, alignment);
-  *(reinterpret_cast<uint32_t*>(*dst)) = *src;
-  *dst += size;
-}
-
-template <>
-inline void WriteAqlArg(
-    unsigned char** dst,  //!< The write pointer to the buffer
-    const uint64_t* src,  //!< The source pointer
-    uint size,            //!< The size in bytes to copy
-    uint alignment        //!< The alignment to follow while writing to the buffer
-) {
-  *dst = amd::alignUp(*dst, alignment);
-  *(reinterpret_cast<uint64_t*>(*dst)) = *src;
-  *dst += size;
-}
-
-const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-    (1 << HSA_PACKET_HEADER_BARRIER) |
-    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
 hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
     VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
-    const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
-  static const bool WaitOnBusyEngine = true;
-  uint64_t ldsAddress = ldsSize();
-  address aqlArgBuf = gpu.cb(0)->SysMemCopy();
-  bool srdResource = false;
+    const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
+  uint64_t argList;
+  address aqlArgBuf = gpu.managedBuffer().reserve(
+    argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
+  gpu.addVmMemory(gpu.managedBuffer().activeMemory());
 
   if (dynamicParallelism()) {
     // Provide the host parent AQL wrap object to the kernel
     AmdAqlWrap wrap = {};
     wrap.state = AQL_WRAP_BUSY;
-    const ConstantBuffer* cb = gpu.cb(1);
-    *vmParentWrap = cb->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
-    gpu.addVmMemory(cb->ActiveMemory());
+    *vmParentWrap = gpu.cb(1)->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
+    gpu.addVmMemory(gpu.cb(1)->ActiveMemory());
   }
 
   const amd::KernelSignature& signature = kernel.signature();
-  const amd::KernelParameters& kernelParams = kernel.parameters();
-  amd::Memory* const* memories =
-    reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
 
-  // Find all parameters for the current kernel
-  for (auto arg : arguments_) {
-    const_address paramaddr = nullptr;
-    if (arg->index_ != uint(-1)) {
-      paramaddr = parameters + signature.at(arg->index_).offset_;
-    }
-
-    // Handle the hidden arguments first, as they do not have a
-    // matching parameter in the OCL signature (not a valid arg->index_)
-    switch (arg->type_) {
-      case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
-        size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
-        assert(arg->size_ == sizeof(offset_x) && "check the sizes");
-        WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_);
+  // Check if runtime has to setup hidden arguments
+  for (const auto& it : signature.hiddenParameters()) {
+    size_t offset;
+    switch (it.info_.oclObject_) {
+      case amd::KernelParameterDescriptor::HiddenNone:
+        //WriteAqlArgAt(aqlArgBuf, &zero, it.size_, it.offset_);
         break;
-      }
-      case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
-        size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
-        assert(arg->size_ == sizeof(offset_y) && "check the sizes");
-        WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_);
+      case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
+        offset = sizes.offset()[0];
+        WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
         break;
-      }
-      case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
-        size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
-        assert(arg->size_ == sizeof(offset_z) && "check the sizes");
-        WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_);
+      case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
+        if (sizes.dimensions() >= 2) {
+            offset = sizes.offset()[1];
+            WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
+        }
         break;
-      }
-      case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
-        size_t bufferPtr = 0;
+      case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
+        if (sizes.dimensions() >= 3) {
+          offset = sizes.offset()[2];
+          WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
+        }
+        break;
+      case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
         if ((printfInfo().size() > 0) &&
             // and printf buffer was allocated
             (gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
           // and set the fourth argument as the printf_buffer pointer
-          bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
+          size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().
+            dbgBuffer()->vmAddress());
           gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
-        }
-        assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
-        WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_);
-        break;
-      }
-      case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
-        assert(arg->size_ == sizeof(static_cast<size_t>(vmDefQueue)) && "check the sizes");
-        WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_);
-        break;
-      case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
-        assert(arg->size_ == sizeof(static_cast<size_t>(*vmParentWrap)) && "check the sizes");
-        WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_);
-        break;
-      case HSAIL_ARGTYPE_HIDDEN_NONE: {
-        void* zero = 0;
-        assert(arg->size_ <= sizeof(zero) && "check the sizes");
-        WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_);
-        break;
-      }
-      case HSAIL_ARGTYPE_POINTER: {
-        // If it is a local pointer
-        if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
-          ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_);
-          WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_);
-          ldsAddress += *reinterpret_cast<const size_t*>(paramaddr);
-          break;
-        }
-        assert(
-            (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) &&
-            "Unsupported address qualifier");
-        WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
-        break;
-      }
-      case HSAIL_ARGTYPE_REFERENCE: {
-        const ConstantBuffer* cb = gpu.cb(1);
-        // Copy the current structure into CB1
-        size_t gpuPtr = static_cast<size_t>(cb->UploadDataToHw(paramaddr, arg->size_));
-        // Then use a pointer in aqlArgBuffer to CB1
-        WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t), sizeof(size_t));
-        gpu.addVmMemory(cb->ActiveMemory());
-        break;
-      }
-      case HSAIL_ARGTYPE_VALUE:
-        if (arg->size_ == sizeof(uint32_t)) {
-          WriteAqlArg(&aqlArgBuf, reinterpret_cast<const uint32_t*>(paramaddr),
-            sizeof(uint32_t), arg->alignment_);
-        } else if (arg->size_ == sizeof(uint64_t)) {
-          WriteAqlArg(&aqlArgBuf, reinterpret_cast<const uint64_t*>(paramaddr),
-            sizeof(uint64_t), arg->alignment_);
-        } else {
-          WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_);
+          WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
         }
         break;
-      case HSAIL_ARGTYPE_IMAGE: {
-        Image* image = nullptr;
-        amd::Memory* mem = nullptr;
-        uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
-        if (nativeMem) {
-          image = reinterpret_cast<Image* const*>(memories)[index];
-          if (nullptr != image) {
-            mem = image->owner();
-          }
-        } else {
-          mem = memories[index];
-          if (mem != nullptr) {
-            image = static_cast<Image*>(dev().getGpuMemory(mem));
-          }
-        }
-
-        //! \note Special case for the image views.
-        //! Copy SRD to CB1, so blit manager will be able to release
-        //! this view without a wait for SRD resource.
-        if (image->memoryType() == Resource::ImageView) {
-          // Copy the current image SRD into CB1
-          const ConstantBuffer* cb = gpu.cb(1);
-          uint64_t srd = cb->UploadDataToHw(image->hwState(), HsaImageObjectSize);
-          // Then use a pointer in aqlArgBuffer to CB1
-          WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
-          gpu.addVmMemory(cb->ActiveMemory());
-        } else {
-          uint64_t srd = image->hwSrd();
-          WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
-          srdResource = true;
-        }
-
-        if (image->desc().isDoppTexture_) {
-          gpu.addDoppRef(image, kernel.parameters().getExecNewVcop(),
-            kernel.parameters().getExecPfpaVcop());
+      case amd::KernelParameterDescriptor::HiddenDefaultQueue:
+        if (vmDefQueue != 0) {
+          WriteAqlArgAt(const_cast<address>(parameters), &vmDefQueue, it.size_, it.offset_);
         }
         break;
-      }
-      case HSAIL_ARGTYPE_SAMPLER: {
-        uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
-        const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
-            kernelParams.samplerObjOffset())[index];
-        const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
-        uint64_t srd = gpuSampler->hwSrd();
-        WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd), sizeof(srd));
-        srdResource = true;
-        break;
-      }
-      case HSAIL_ARGTYPE_QUEUE: {
-        uint32_t index = signature.at(arg->index_).info_.arrayIndex_;
-        const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
-          parameters + kernelParams.queueObjOffset())[index];
-        VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
-        uint64_t vmQueue;
-        if (dev().settings().useDeviceQueue_) {
-          vmQueue = gpuQueue->vQueue()->vmAddress();
-        } else {
-          if (!gpu.createVirtualQueue(queue->size())) {
-            LogError("Virtual queue creation failed!");
-            return nullptr;
-          }
-          vmQueue = gpu.vQueue()->vmAddress();
+      case amd::KernelParameterDescriptor::HiddenCompletionAction:
+        if (*vmParentWrap != 0) {
+          WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, it.size_, it.offset_);
         }
-        WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue), sizeof(vmQueue));
         break;
-      }
-      default:
-        LogError(" Unsupported argument type ");
-        return nullptr;
     }
   }
 
-  if (ldsAddress > dev().info().localMemSize_) {
-    LogError("No local memory available\n");
-    return nullptr;
-  }
+  // Load all kernel arguments
+  WriteAqlArgAt(aqlArgBuf, parameters, signature.paramsSize(), 0);
+  assert(argsBufferSize() == amd::alignUp(signature.paramsSize(), 16) &&
+    "A mismatch of sizes of arguments between compiler and runtime!");
 
-#if defined(WITH_LIGHTNING_COMPILER)
-  // Check there is no arguments' buffer overflow. We may not use all the
-  // hidden argument slots.
-  assert(aqlArgBuf <= (gpu.cb(0)->SysMemCopy() + argsBufferSize()));
-#else   // !defined(WITH_LIGHTNING_COMPILER)
-  // HSAIL kernarg segment size is rounded up to multiple of 16.
-  aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
-  assert((aqlArgBuf == (gpu.cb(0)->SysMemCopy() + argsBufferSize())) &&
-         "Size and the number of arguments don't match!");
-#endif  // !defined(WITH_LIGHTNING_COMPILER)
-  hsa_kernel_dispatch_packet_t* hsaDisp =
-      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy() + argsBufferSize());
+  //hsa_kernel_dispatch_packet_t disp;
+  hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
+    gpu.cb(0)->SysMemCopy());
 
   amd::NDRange local(sizes.local());
   const amd::NDRange& global = sizes.global();
@@ -1117,6 +1015,12 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
   // Check if runtime has to find local workgroup size
   findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
 
+  constexpr uint16_t kDispatchPacketHeader =
+    (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+    (1 << HSA_PACKET_HEADER_BARRIER) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+
   hsaDisp->header = kDispatchPacketHeader;
   hsaDisp->setup = sizes.dimensions();
 
@@ -1134,28 +1038,16 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
   hsaDisp->group_segment_size = ldsAddress - ldsSize();
   hsaDisp->kernel_object = gpuAqlCode();
 
-  const ConstantBuffer* cb = gpu.cb(0);
-  uint64_t argList = cb->UploadDataToHw(
-    argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
-
   hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
   hsaDisp->reserved2 = 0;
   hsaDisp->completion_signal.handle = 0;
+  memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));
 
-  gpu.addVmMemory(cb->ActiveMemory());
-  gpu.addVmMemory(&prog().codeSegGpu());
-  for (pal::Memory* mem : prog().globalStores()) {
-    gpu.addVmMemory(mem);
-  }
   if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
-                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
+      AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
     gpu.addVmMemory(gpu.hsaQueueMem());
   }
 
-  if (srdResource || prog().isStaticSampler()) {
-    dev().srds().fillResourceList(gpu);
-  }
-
   return hsaDisp;
 }
 
@@ -1398,6 +1290,8 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcA
 
 void LightningKernel::initArgList(const KernelMD& kernelMD) {
   device::Kernel::parameters_t params;
+  device::Kernel::parameters_t hiddenParams;
+  size_t offsetStruct = argsBufferSize();
 
   size_t offset = 0;
 
@@ -1426,20 +1320,27 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) {
 
     arg->index_ = isHidden ? uint(-1) : params.size();
     arguments_.push_back(arg);
-
-    if (isHidden) {
-      continue;
-    }
-
     // Initialize Device kernel parameters
     amd::KernelParameterDescriptor desc;
 
+    if (isHidden) {
+      offset = amd::alignUp(offset, arguments_[i]->alignment_);
+      desc.offset_ = offset;
+      desc.size_ = arguments_[i]->size_;
+      offset += arguments_[i]->size_;
+      desc.info_.oclObject_ = GetOclArgumentType(arguments_[i]);
+      hiddenParams.push_back(desc);
+      continue;
+    }
+
     desc.name_ = lcArg.mName.c_str();
     desc.type_ = GetOclType(arg);
     desc.addressQualifier_ = GetOclAddrQual(arg);
     desc.accessQualifier_ = GetOclAccessQual(arg);
     desc.typeQualifier_ = GetOclTypeQual(lcArg);
     desc.typeName_ = lcArg.mTypeName.c_str();
+    desc.info_.oclObject_ = GetOclArgumentType(arg);
+    desc.info_.arrayIndex_ = arg->pointeeAlignment_;
 
     // Make a check if it is local or global
     if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
@@ -1456,14 +1357,37 @@ void LightningKernel::initArgList(const KernelMD& kernelMD) {
       // Local memory for CPU
       size = sizeof(cl_mem);
     }
-    offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16)));
-    desc.offset_ = offset;
-    offset += amd::alignUp(size, sizeof(uint32_t));
+    // Check if HSAIL expects data by reference and allocate it behind
+    if (arguments_[i]->type_ == HSAIL_ARGTYPE_REFERENCE) {
+      desc.offset_ = offsetStruct;
+      // Align the offset reference
+      offset = amd::alignUp(offset, sizeof(size_t));
+      patchReferences_.insert({ desc.offset_, offset });
+      offsetStruct += size;
+      // Adjust the offset of arguments
+      offset += sizeof(size_t);
+    }
+    else {
+      // These objects have forced data size to uint64_t
+      if ((desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+          (desc.info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+          (desc.info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+        offset = amd::alignUp(offset, sizeof(uint64_t));
+        desc.offset_ = offset;
+        offset += sizeof(uint64_t);
+      } else {
+        offset = amd::alignUp(offset, arguments_[i]->alignment_);
+        desc.offset_ = offset;
+        offset += size;
+      }
+    }
+    // Update read only flag
+    desc.info_.readOnly_ = (arguments_[i]->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
 
     params.push_back(desc);
   }
 
-  createSignature(params);
+  createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_1);
 }
 
 static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
index 66e4132055..7ffc144c8b 100644
--- a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
@@ -182,7 +182,7 @@ class HSAILKernel : public device::Kernel {
       const amd::Kernel& kernel,           //!< AMD kernel object
       const amd::NDRangeContainer& sizes,  //!< NDrange container
       const_address parameters,            //!< Application arguments for the kernel
-      bool nativeMem,                      //!< Native memory objects are passed
+      size_t ldsAddress,                   //!< LDS address that includes all arguments.
       uint64_t vmDefQueue,                 //!< GPU VM default queue pointer
       uint64_t* vmParentWrap               //!< GPU VM parent aql wrap object
       ) const;
@@ -204,6 +204,8 @@ class HSAILKernel : public device::Kernel {
     return waveLimiter_.getWavesPerSH(vdev);
   };
 
+  const std::unordered_map<size_t, size_t>& patch() const { return patchReferences_; }
+
  private:
   //! Disable copy constructor
   HSAILKernel(const HSAILKernel&);
@@ -234,6 +236,7 @@ class HSAILKernel : public device::Kernel {
   const HSAILProgram& prog_;          //!< Reference to the parent program
   std::vector<PrintfInfo> printf_;    //!< Format strings for GPU printf support
   uint index_;                        //!< Kernel index in the program
+  std::unordered_map<size_t, size_t> patchReferences_;  //!< Patch table for references
 
   uint64_t code_;    //!< GPU memory pointer to the kernel
   size_t codeSize_;  //!< Size of ISA code
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
index 3f47b22e12..00fd8736d3 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
@@ -219,6 +219,8 @@ class Image : public pal::Memory {
                                size_t* slicePitch = NULL    //!< Slice for the mapped memory
                                );
 
+  virtual uint64_t virtualAddress() const override { return hwSrd(); }
+
  private:
   //! Disable copy constructor
   Image(const Image&);
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index b2e373a4f5..7c45951176 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -461,9 +461,8 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
   if (flushL1Cache) {
     // Flush cache
     if (!gpu.profiling()) {
-        gpu.addBarrier();
+      gpu.addBarrier();
     }
-
     // Clear memory dependency state
     const static bool All = true;
     clear(!All);
@@ -2112,13 +2111,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
       return false;
     }
   }
-
+  size_t ldsSize;
   // Check memory dependency and SVM objects
-  if (!processMemObjectsHSA(kernel, parameters, nativeMem)) {
+  if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
       LogError("Wrong memory objects!");
       return false;
   }
-
   bool needFlush = false;
   // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
   // are in the same cmdBuffer
@@ -2194,7 +2192,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     uint64_t vmParentWrap = 0;
     // Program the kernel arguments for the GPU execution
     hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
-      *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap);
+      *this, kernel, tmpSizes, parameters, ldsSize, vmDefQueue, &vmParentWrap);
     if (nullptr == aqlPkt) {
       LogError("Couldn't load kernel arguments");
       return false;
@@ -2948,7 +2946,7 @@ void VirtualGPU::profileEvent(EngineType engine, bool type) const {
 }
 
 bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
-                                      bool nativeMem) {
+                                      bool nativeMem, size_t& ldsAddress) {
   const amd::KernelParameters& kernelParams = kernel.parameters();
 
   // Mark the tracker with a new kernel,
@@ -3015,68 +3013,155 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
     }
   }
 
+  bool srdResource = false;
   amd::Memory* const* memories =
       reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
   const HSAILKernel& hsaKernel =
       static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
   const amd::KernelSignature& signature = kernel.signature();
+  ldsAddress = hsaKernel.ldsSize();
 
-  // Check all parameters for the current kernel
-  for (size_t i = 0; i < signature.numParameters(); ++i) {
-    const amd::KernelParameterDescriptor& desc = signature.at(i);
-    const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i);
-
-    // Find if current argument is a buffer
-    if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
-      Memory* gpuMem = nullptr;
-      amd::Memory* mem = nullptr;
-      uint32_t index = desc.info_.arrayIndex_;
-      if (nativeMem) {
-        gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
-        if (nullptr != gpuMem) {
-          mem = gpuMem->owner();
-        }
-      } else {
-        mem = memories[index];
-        if (mem != nullptr) {
-          gpuMem = dev().getGpuMemory(mem);
-          // Synchronize data with other memory instances if necessary
-          gpuMem->syncCacheFromHost(*this);
-        }
-      }
-      //! This condition is for SVM fine-grain
-      if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
-        addBarrier();
-        // Clear memory dependency state
-        const static bool All = true;
-        memoryDependency().clear(!All);
-        continue;
-      } else if (gpuMem != nullptr) {
-        // Check image
-        bool readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
-        // Check buffer
-        readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
-        // Validate memory for a dependency in the queue
-        memoryDependency().validate(*this, gpuMem, readOnly);
-
-        // Wait for resource if it was used on an inactive engine
-        //! \note syncCache may call DRM transfer
-        constexpr bool WaitOnBusyEngine = true;
-        gpuMem->wait(*this, WaitOnBusyEngine);
-
-        //! Check if compiler expects read/write
-        if ((mem != nullptr) && !desc.info_.readOnly_) {
-          mem->signalWrite(&dev());
-        }
-        addVmMemory(gpuMem);
+  if (!nativeMem) {
+    // Process cache coherency first, since the extra transfers may affect
+    // other mem dependency tracking logic: TS and signalWrite()
+    for (uint i = 0; i < signature.numMemories(); ++i) {
+      amd::Memory* mem = memories[i];
+      if (mem != nullptr) {
+        // Synchronize data with other memory instances if necessary
+        dev().getGpuMemory(mem)->syncCacheFromHost(*this);
       }
     }
   }
 
-  for (pal::Memory* mem : hsaKernel.prog().globalStores()) {
+  // Check all parameters for the current kernel
+  for (size_t i = 0; i < signature.numParameters(); ++i) {
+    const amd::KernelParameterDescriptor& desc = signature.at(i);
+    const amd::KernelParameterDescriptor::InfoData& info = desc.info_;
+
+    // Find if current argument is a buffer
+    if (desc.type_ == T_POINTER) {
+      // If it is a local pointer (zero size); arrayIndex_ is passed to alignUp() as alignment
+      if (desc.size_ == 0) {
+        ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
+        // Save the original LDS size
+        size_t ldsSize = *reinterpret_cast<const size_t*>(params + desc.offset_);
+        // Patch the LDS address in the original arguments with an LDS address(offset)
+        WriteAqlArgAt(const_cast<address>(params), &ldsAddress, sizeof(void*), desc.offset_);
+        // Add the original size
+        ldsAddress += ldsSize;
+      } else {
+        Memory* gpuMem = nullptr;
+        amd::Memory* mem = nullptr;
+        uint32_t index = info.arrayIndex_;
+        if (nativeMem) {
+          gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
+          if (nullptr != gpuMem) {
+            mem = gpuMem->owner();
+          }
+        } else {
+          mem = memories[index];
+          if (mem != nullptr) {
+            gpuMem = dev().getGpuMemory(mem);
+          }
+        }
+        //! This condition is for SVM fine-grain
+        if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
+          addBarrier();
+          // Clear memory dependency state
+          const static bool All = true;
+          memoryDependency().clear(!All);
+          continue;
+        } else if (gpuMem != nullptr) {
+          // Validate memory for a dependency in the queue
+          memoryDependency().validate(*this, gpuMem, info.readOnly_);
+          // Wait for resource if it was used on an inactive engine
+          //! \note syncCache may call DRM transfer
+          constexpr bool WaitOnBusyEngine = true;
+          gpuMem->wait(*this, WaitOnBusyEngine);
+
+          addVmMemory(gpuMem);
+
+          //! Check if compiler expects read/write.
+          //! Note: SVM with subbuffers has an issue with tracking.
+          //! Conformance can send read only subbuffer, but update the region
+          //! in the kernel.
+          if ((mem != nullptr) &&
+              ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
+               ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
+            mem->signalWrite(&dev());
+          }
+          if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+            //! \note Special case for the image views.
+            //! Copy SRD to CB1, so blit manager will be able to release
+            //! this view without a wait for SRD resource.
+            if (gpuMem->memoryType() == Resource::ImageView) {
+              // Copy the current image SRD into CB1
+              uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize);
+              // Then use a pointer in aqlArgBuffer to CB1
+              // Patch the GPU VA address in the original arguments
+              WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
+              addVmMemory(cb(1)->ActiveMemory());
+            } else {
+              srdResource = true;
+            }
+            if (gpuMem->desc().isDoppTexture_) {
+              addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
+                kernel.parameters().getExecPfpaVcop());
+            }
+          }
+        }
+      }
+    }
+    else if (desc.type_ == T_VOID) {
+      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
+        // Copy the by-reference data, located at desc.offset_ in the arguments, into CB1
+        size_t gpuPtr = static_cast<size_t>(
+            cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
+        const auto it = hsaKernel.patch().find(desc.offset_);
+        // Patch the slot from the kernel's patch table with the GPU VA of the CB1 copy
+        WriteAqlArgAt(const_cast<address>(params), &gpuPtr, sizeof(size_t), it->second);
+        addVmMemory(cb(1)->ActiveMemory());
+      }
+    }
+    else if (desc.type_ == T_SAMPLER) {
+      srdResource = true;
+    } else if (desc.type_ == T_QUEUE) {
+      uint32_t index = desc.info_.arrayIndex_;
+      const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
+        params + kernelParams.queueObjOffset())[index];
+      VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
+      uint64_t vmQueue;
+      if (dev().settings().useDeviceQueue_) {
+        vmQueue = gpuQueue->vQueue()->vmAddress();
+      } else {
+        if (!createVirtualQueue(queue->size())) {
+          LogError("Virtual queue creation failed!");
+          return false;
+        }
+        vmQueue = vQueue()->vmAddress();
+      }
+      // Patch the GPU VA address in the original arguments
+      WriteAqlArgAt(const_cast<address>(params), &vmQueue, sizeof(vmQueue), desc.offset_);
+      // Do not leave the loop here: arguments after the queue must still be processed
+    }
+  }
+
+  if (ldsAddress > dev().info().localMemSize_) {
+    LogError("No local memory available\n");
+    return false;
+  }
+
+  if (srdResource || hsaKernel.prog().isStaticSampler()) {
+    dev().srds().fillResourceList(*this);
+  }
+
+  addVmMemory(&hsaKernel.prog().codeSegGpu());
+
+  for (const pal::Memory* mem : hsaKernel.prog().globalStores()) {
     const static bool IsReadOnly = false;
     // Validate global store for a dependency in the queue
     memoryDependency().validate(*this, mem, IsReadOnly);
+    addVmMemory(mem);
   }
 
   return true;
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index fa48024c9d..fccee6d60e 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -378,6 +378,9 @@ class VirtualGPU : public device::VirtualDevice {
   //! Return xfer buffer for staging operations
   XferBuffer& xferWrite() { return writeBuffer_; }
 
+  //! Return managed buffer for staging operations
+  ManagedBuffer& managedBuffer() { return managedBuffer_; }
+
   //! Adds a pinned memory object into a map
   void addPinnedMem(amd::Memory* mem);
 
@@ -529,7 +532,8 @@ class VirtualGPU : public device::VirtualDevice {
   //! Detects memory dependency for HSAIL kernels and flushes caches
   bool processMemObjectsHSA(const amd::Kernel& kernel,  //!< AMD kernel object for execution
                             const_address params,       //!< Pointer to the param's store
-                            bool nativeMem              //!< Native memory objects
+                            bool nativeMem,             //!< Native memory objects
+                            size_t& ldsAddress          //!< Returns LDS size, used in the kernel
                             );
 
   //! Common function for fill memory used by both svm Fill and non-svm fill
@@ -644,4 +648,33 @@ uint VirtualGPU::Queue::submit(bool forceFlush) {
   return id;
 }
 
+template <typename T>
+inline void WriteAqlArgAt(
+  unsigned char* dst,   //!< Base pointer of the destination buffer
+  const T* src,         //!< The source pointer
+  uint size,            //!< The size in bytes to copy
+  size_t offset         //!< Byte offset from dst at which the data is written
+) {
+  memcpy(dst + offset, src, size);
+}
+
+template <>
+inline void WriteAqlArgAt(
+  unsigned char* dst,   //!< Base pointer of the destination buffer
+  const uint32_t* src,  //!< Pointer to the 32-bit value to store
+  uint size,            //!< Not used here: one uint32_t is always stored
+  size_t offset         //!< Byte offset from dst at which the value is stored
+) {
+  *(reinterpret_cast<uint32_t*>(dst + offset)) = *src;
+}
+
+template <>
+inline void WriteAqlArgAt(
+  unsigned char* dst,   //!< Base pointer of the destination buffer
+  const uint64_t* src,  //!< Pointer to the 64-bit value to store
+  uint size,            //!< Not used here: one uint64_t is always stored
+  size_t offset         //!< Byte offset from dst at which the value is stored
+) {
+  *(reinterpret_cast<uint64_t*>(dst + offset)) = *src;
+}
 /*@}*/} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
index f0bf4e95c6..b7cf1ed1c3 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
@@ -581,7 +581,8 @@ void HSAILKernel::initArguments(const aclArgData* aclArg) {
 
     params.push_back(desc);
   }
-  createSignature(params);
+  device::Kernel::parameters_t hiddenParams;
+  createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
 }
 #endif // defined(WITH_COMPILER_LIB)
 
@@ -660,8 +661,8 @@ void LightningKernel::initArguments(const KernelMD& kernelMD) {
 
     params.push_back(desc);
   }
-
-  createSignature(params);
+  device::Kernel::parameters_t hiddenParams;
+  createSignature(params, hiddenParams, amd::KernelSignature::ABIVersion_0);
 }
 #endif  // defined(WITH_LIGHTNING_COMPILER)
 
diff --git a/projects/clr/rocclr/runtime/platform/kernel.cpp b/projects/clr/rocclr/runtime/platform/kernel.cpp
index 38fc9cabe0..a4616033aa 100644
--- a/projects/clr/rocclr/runtime/platform/kernel.cpp
+++ b/projects/clr/rocclr/runtime/platform/kernel.cpp
@@ -243,13 +243,17 @@ void KernelParameters::release(address mem, const amd::Device& device) const {
 }
 
 KernelSignature::KernelSignature(const std::vector<KernelParameterDescriptor>& params,
-  const std::string& attrib)
+  const std::string& attrib,
+    const std::vector<KernelParameterDescriptor>& hiddenParams,
+    uint32_t version)
   : params_(params)
+  , hiddenParams_(hiddenParams)
   , attributes_(attrib)
   , paramsSize_(0)
   , numMemories_(0)
   , numSamplers_(0)
-  , numQueues_(0) {
+  , numQueues_(0)
+  , version_(version) {
   size_t maxOffset = 0;
   size_t last = 0;
   // Find the last entry
@@ -283,7 +287,15 @@ KernelSignature::KernelSignature(const std::vector<KernelParameterDescriptor>& p
     if (lastSize == 0 /* local mem */) {
       lastSize = sizeof(cl_mem);
     }
-    paramsSize_ = params[last].offset_ + alignUp(lastSize, sizeof(intptr_t));
+    // Note: It's a special case. HW ABI expects 64 bit for SRD, regardless of the binary.
+    // Force the size to 64 bit for those cases.
+    if ((params[last].info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) ||
+        (params[last].info_.oclObject_ == amd::KernelParameterDescriptor::SamplerObject) ||
+        (params[last].info_.oclObject_ == amd::KernelParameterDescriptor::QueueObject)) {
+      lastSize = alignUp(lastSize, sizeof(uint64_t));
+    }
+    paramsSize_ = params[last].offset_ + lastSize;
+    paramsSize_ = alignUp(paramsSize_, sizeof(intptr_t));
   }
 }
 }  // namespace amd
diff --git a/projects/clr/rocclr/runtime/platform/kernel.hpp b/projects/clr/rocclr/runtime/platform/kernel.hpp
index 838c5d7198..f506241ee0 100644
--- a/projects/clr/rocclr/runtime/platform/kernel.hpp
+++ b/projects/clr/rocclr/runtime/platform/kernel.hpp
@@ -36,18 +36,30 @@ class Program;
 class KernelSignature : public HeapObject {
  private:
   std::vector<KernelParameterDescriptor> params_;
+  std::vector<KernelParameterDescriptor> hiddenParams_;
   std::string attributes_;  //!< The kernel attributes
   uint32_t  paramsSize_;
   uint32_t  numMemories_;
   uint32_t  numSamplers_;
   uint32_t  numQueues_;
+  uint32_t  version_;
 
  public:
+  enum {
+    ABIVersion_0 = 0,   //! ABI constructed based on the OCL semantics
+    ABIVersion_1 = 1    //! ABI constructed based on the HW ABI returned from the compiler
+  };
+
   //! Default constructor
-  KernelSignature() : paramsSize_(0), numMemories_(0), numSamplers_(0), numQueues_(0) {}
+  KernelSignature():
+    paramsSize_(0), numMemories_(0), numSamplers_(0),
+    numQueues_(0), version_(ABIVersion_0) {}
 
   //! Construct a new signature.
-  KernelSignature(const std::vector<KernelParameterDescriptor>& params, const std::string& attrib);
+  KernelSignature(const std::vector<KernelParameterDescriptor>& params,
+    const std::string& attrib,
+    const std::vector<KernelParameterDescriptor>& hiddenParams,
+    uint32_t version);
 
   //! Return the number of parameters
   size_t numParameters() const { return params_.size(); }
@@ -72,8 +84,17 @@ class KernelSignature : public HeapObject {
   //! Returns the number of queue objects.
   uint32_t numQueues() const { return numQueues_; }
 
+  //! Returns the signature version
+  uint32_t version() const { return version_; }
+
   //! Return the kernel attributes
   const std::string& attributes() const { return attributes_; }
+
+  const std::vector<KernelParameterDescriptor>& hiddenParameters() const
+    { return hiddenParams_; }
+
+  const std::vector<KernelParameterDescriptor>& parameters() const
+    { return params_; }
 };
 
 // @todo: look into a copy-on-write model instead of copy-on-read.
diff --git a/projects/clr/rocclr/runtime/platform/program.cpp b/projects/clr/rocclr/runtime/platform/program.cpp
index 055c351b07..9b68004437 100644
--- a/projects/clr/rocclr/runtime/platform/program.cpp
+++ b/projects/clr/rocclr/runtime/platform/program.cpp
@@ -604,8 +604,8 @@ bool Program::ParseAllOptions(const std::string& options, option::Options& parse
 }
 
 bool Symbol::setDeviceKernel(const Device& device, const device::Kernel* func) {
-  // FIXME_lmoriche: check that the signatures are compatible
-  if (deviceKernels_.size() == 0) {
+  if (deviceKernels_.size() == 0 ||
+      (func->signature().version() > KernelSignature::ABIVersion_0)) {
     signature_ = func->signature();
   }
   deviceKernels_[&device] = func;