P4 to Git Change 1767752 by wchau@wchau_OCL_Linux on 2019/04/09 22:58:03

SWDEV-165259 - Update OpenCL runtime to support MsgPack metadata - Add support for the V3 code objects Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#19 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.hpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#336 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#134 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/inc/core/palCmdBuffer.h#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx6/gfx6ComputeCmdBuffer.cpp#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/hw/gfxip/gfx9/gfx9ComputeCmdBuffer.cpp#69 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#77 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#90 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#76 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#130 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#52 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#103 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#47 edit [ROCm/clr commit: 36a5f2a85f]
2019-04-09 23:24:10 -04:00
@@ -29,10 +29,487 @@ using llvm::AMDGPU::HSAMD::ValueType;

 namespace device {

+#if defined(USE_COMGR_LIBRARY)
+static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key,
+                                       const amd_comgr_metadata_node_t value,
+                                       void *data) {
+  amd_comgr_status_t status;
+  amd_comgr_metadata_kind_t kind;
+  std::string buf;
+
+  // get the key of the argument field
+  size_t size = 0;
+  status = amd::Comgr::get_metadata_kind(key, &kind);
+  if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+    status = getMetaBuf(key, &buf);
+  }
+
+  if (status != AMD_COMGR_STATUS_SUCCESS) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  auto itArgField = ArgFieldMap.find(buf);
+  if (itArgField == ArgFieldMap.end()) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  // get the value of the argument field
+  status = getMetaBuf(value, &buf);
+
+  KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
+
+  switch (itArgField->second) {
+    case ArgField::Name:
+      lcArg->mName = buf;
+      break;
+    case ArgField::TypeName:
+      lcArg->mTypeName = buf;
+      break;
+    case ArgField::Size:
+      lcArg->mSize = atoi(buf.c_str());
+      break;
+    case ArgField::Align:
+      lcArg->mAlign = atoi(buf.c_str());
+      break;
+    case ArgField::ValueKind:
+      {
+        auto itValueKind = ArgValueKind.find(buf);
+        if (itValueKind == ArgValueKind.end()) {
+          return AMD_COMGR_STATUS_ERROR;
+        }
+        lcArg->mValueKind = itValueKind->second;
+      }
+      break;
+    case ArgField::ValueType:
+      {
+        auto itValueType = ArgValueType.find(buf);
+        if (itValueType == ArgValueType.end()) {
+          return AMD_COMGR_STATUS_ERROR;
+       }
+       lcArg->mValueType = itValueType->second;
+      }
+      break;
+    case ArgField::PointeeAlign:
+      lcArg->mPointeeAlign = atoi(buf.c_str());
+      break;
+    case ArgField::AddrSpaceQual:
+      {
+        auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf);
+        if (itAddrSpaceQual == ArgAddrSpaceQual.end()) {
+          return AMD_COMGR_STATUS_ERROR;
+        }
+        lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
+      }
+      break;
+    case ArgField::AccQual:
+      {
+        auto itAccQual = ArgAccQual.find(buf);
+        if (itAccQual == ArgAccQual.end()) {
+          return AMD_COMGR_STATUS_ERROR;
+        }
+        lcArg->mAccQual = itAccQual->second;
+      }
+      break;
+    case ArgField::ActualAccQual:
+      {
+        auto itAccQual = ArgAccQual.find(buf);
+        if (itAccQual == ArgAccQual.end()) {
+            return AMD_COMGR_STATUS_ERROR;
+        }
+        lcArg->mActualAccQual = itAccQual->second;
+      }
+      break;
+    case ArgField::IsConst:
+      lcArg->mIsConst = (buf.compare("true") == 0);
+      break;
+    case ArgField::IsRestrict:
+      lcArg->mIsRestrict = (buf.compare("true") == 0);
+      break;
+    case ArgField::IsVolatile:
+      lcArg->mIsVolatile = (buf.compare("true") == 0);
+      break;
+    case ArgField::IsPipe:
+      lcArg->mIsPipe = (buf.compare("true") == 0);
+      break;
+    default:
+      return AMD_COMGR_STATUS_ERROR;
+  }
+  return AMD_COMGR_STATUS_SUCCESS;
+}
+
+static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key,
+                                        const amd_comgr_metadata_node_t value,
+                                        void *data) {
+  amd_comgr_status_t status;
+  amd_comgr_metadata_kind_t kind;
+  size_t size = 0;
+  std::string buf;
+
+  // get the key of the argument field
+  status = amd::Comgr::get_metadata_kind(key, &kind);
+  if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+    status = getMetaBuf(key, &buf);
+  }
+
+  if (status != AMD_COMGR_STATUS_SUCCESS) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  auto itAttrField = AttrFieldMap.find(buf);
+  if (itAttrField == AttrFieldMap.end()) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  KernelMD* kernelMD = static_cast<KernelMD*>(data);
+  switch (itAttrField->second) {
+    case AttrField::ReqdWorkGroupSize:
+      {
+        status = amd::Comgr::get_metadata_list_size(value, &size);
+        if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
+          for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
+            amd_comgr_metadata_node_t workgroupSize;
+            status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
+
+            if (status == AMD_COMGR_STATUS_SUCCESS &&
+                getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
+              kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
+            }
+            amd::Comgr::destroy_metadata(workgroupSize);
+          }
+        }
+      }
+      break;
+    case AttrField::WorkGroupSizeHint:
+      {
+        status = amd::Comgr::get_metadata_list_size(value, &size);
+        if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
+          for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
+            amd_comgr_metadata_node_t workgroupSizeHint;
+            status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
+
+            if (status == AMD_COMGR_STATUS_SUCCESS &&
+                getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
+              kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
+            }
+            amd::Comgr::destroy_metadata(workgroupSizeHint);
+          }
+        }
+      }
+      break;
+    case AttrField::VecTypeHint:
+      {
+        if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
+          kernelMD->mAttrs.mVecTypeHint = buf;
+        }
+      }
+      break;
+    case AttrField::RuntimeHandle:
+      {
+        if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
+          kernelMD->mAttrs.mRuntimeHandle = buf;
+        }
+      }
+      break;
+    default:
+      return AMD_COMGR_STATUS_ERROR;
+  }
+
+  return status;
+}
+
+static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key,
+                                            const amd_comgr_metadata_node_t value,
+                                            void *data) {
+  amd_comgr_status_t status;
+  amd_comgr_metadata_kind_t kind;
+  std::string buf;
+
+  // get the key of the argument field
+  status = amd::Comgr::get_metadata_kind(key, &kind);
+  if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+    status = getMetaBuf(key, &buf);
+  }
+
+  if (status != AMD_COMGR_STATUS_SUCCESS) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  auto itCodePropField = CodePropFieldMap.find(buf);
+  if (itCodePropField == CodePropFieldMap.end()) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  // get the value of the argument field
+  if (status == AMD_COMGR_STATUS_SUCCESS) {
+    status = getMetaBuf(value, &buf);
+  }
+
+  KernelMD*  kernelMD = static_cast<KernelMD*>(data);
+  switch (itCodePropField->second) {
+    case CodePropField::KernargSegmentSize:
+      kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
+      break;
+    case CodePropField::GroupSegmentFixedSize:
+      kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str());
+      break;
+    case CodePropField::PrivateSegmentFixedSize:
+      kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
+      break;
+    case CodePropField::KernargSegmentAlign:
+      kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
+      break;
+    case CodePropField::WavefrontSize:
+      kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
+      break;
+    case CodePropField::NumSGPRs:
+      kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
+      break;
+    case CodePropField::NumVGPRs:
+      kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
+      break;
+    case CodePropField::MaxFlatWorkGroupSize:
+      kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
+      break;
+    case CodePropField::IsDynamicCallStack:
+        kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0);
+      break;
+    case CodePropField::IsXNACKEnabled:
+      kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0);
+      break;
+    case CodePropField::NumSpilledSGPRs:
+      kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
+      break;
+    case CodePropField::NumSpilledVGPRs:
+      kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
+      break;
+    default:
+      return AMD_COMGR_STATUS_ERROR;
+  }
+  return AMD_COMGR_STATUS_SUCCESS;
+}
+
+static amd_comgr_status_t populateArgsV3(const amd_comgr_metadata_node_t key,
+                                         const amd_comgr_metadata_node_t value,
+                                         void *data) {
+  amd_comgr_status_t status;
+  amd_comgr_metadata_kind_t kind;
+  std::string buf;
+
+  // get the key of the argument field
+  size_t size = 0;
+  status = amd::Comgr::get_metadata_kind(key, &kind);
+  if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+    status = getMetaBuf(key, &buf);
+  }
+
+  if (status != AMD_COMGR_STATUS_SUCCESS) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  auto itArgField = ArgFieldMapV3.find(buf);
+  if (itArgField == ArgFieldMapV3.end()) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  // get the value of the argument field
+  status = getMetaBuf(value, &buf);
+
+  KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
+
+  switch (itArgField->second) {
+    case ArgField::Name:
+      lcArg->mName = buf;
+      break;
+    case ArgField::TypeName:
+      lcArg->mTypeName = buf;
+      break;
+    case ArgField::Size:
+      lcArg->mSize = atoi(buf.c_str());
+      break;
+    case ArgField::Offset:
+      lcArg->mOffset = atoi(buf.c_str());
+      break;
+    case ArgField::ValueKind:
+      {
+        auto itValueKind = ArgValueKindV3.find(buf);
+        if (itValueKind == ArgValueKindV3.end()) {
+          return AMD_COMGR_STATUS_ERROR;
+        }
+        lcArg->mValueKind = itValueKind->second;
+      }
+      break;
+    case ArgField::ValueType:
+      {
+        auto itValueType = ArgValueTypeV3.find(buf);
+        if (itValueType == ArgValueTypeV3.end()) {
+          return AMD_COMGR_STATUS_ERROR;
+       }
+       lcArg->mValueType = itValueType->second;
+      }
+      break;
+    case ArgField::PointeeAlign:
+      lcArg->mPointeeAlign = atoi(buf.c_str());
+      break;
+    case ArgField::AddrSpaceQual:
+      {
+        auto itAddrSpaceQual = ArgAddrSpaceQualV3.find(buf);
+        if (itAddrSpaceQual == ArgAddrSpaceQualV3.end()) {
+          return AMD_COMGR_STATUS_ERROR;
+        }
+        lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
+      }
+      break;
+    case ArgField::AccQual:
+      {
+        auto itAccQual = ArgAccQualV3.find(buf);
+        if (itAccQual == ArgAccQualV3.end()) {
+          return AMD_COMGR_STATUS_ERROR;
+        }
+        lcArg->mAccQual = itAccQual->second;
+      }
+      break;
+    case ArgField::ActualAccQual:
+      {
+        auto itAccQual = ArgAccQualV3.find(buf);
+        if (itAccQual == ArgAccQualV3.end()) {
+            return AMD_COMGR_STATUS_ERROR;
+        }
+        lcArg->mActualAccQual = itAccQual->second;
+      }
+      break;
+    case ArgField::IsConst:
+      lcArg->mIsConst = (buf.compare("true") == 0);
+      break;
+    case ArgField::IsRestrict:
+      lcArg->mIsRestrict = (buf.compare("true") == 0);
+      break;
+    case ArgField::IsVolatile:
+      lcArg->mIsVolatile = (buf.compare("true") == 0);
+      break;
+    case ArgField::IsPipe:
+      lcArg->mIsPipe = (buf.compare("true") == 0);
+      break;
+    default:
+      return AMD_COMGR_STATUS_ERROR;
+  }
+  return AMD_COMGR_STATUS_SUCCESS;
+}
+
+static amd_comgr_status_t populateKernelMetaV3(const amd_comgr_metadata_node_t key,
+                                               const amd_comgr_metadata_node_t value,
+                                               void *data) {
+  amd_comgr_status_t status;
+  amd_comgr_metadata_kind_t kind;
+  size_t size = 0;
+  std::string buf;
+
+  // get the key of the argument field
+  status = amd::Comgr::get_metadata_kind(key, &kind);
+  if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
+    status = getMetaBuf(key, &buf);
+  }
+
+  if (status != AMD_COMGR_STATUS_SUCCESS) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  auto itKernelField = KernelFieldMapV3.find(buf);
+  if (itKernelField == KernelFieldMapV3.end()) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  if (itKernelField->second != KernelField::ReqdWorkGroupSize &&
+      itKernelField->second != KernelField::WorkGroupSizeHint) {
+      status = getMetaBuf(value,&buf);
+  }
+  if (status != AMD_COMGR_STATUS_SUCCESS) {
+    return AMD_COMGR_STATUS_ERROR;
+  }
+
+  KernelMD* kernelMD = static_cast<KernelMD*>(data);
+  switch (itKernelField->second) {
+    case KernelField::ReqdWorkGroupSize:
+      status = amd::Comgr::get_metadata_list_size(value, &size);
+      if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
+        for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
+          amd_comgr_metadata_node_t workgroupSize;
+          status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
+
+          if (status == AMD_COMGR_STATUS_SUCCESS &&
+              getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
+            kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
+          }
+          amd::Comgr::destroy_metadata(workgroupSize);
+        }
+      }
+      break;
+    case KernelField::WorkGroupSizeHint:
+      status = amd::Comgr::get_metadata_list_size(value, &size);
+      if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
+        for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
+          amd_comgr_metadata_node_t workgroupSizeHint;
+          status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
+
+          if (status == AMD_COMGR_STATUS_SUCCESS &&
+              getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
+            kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
+          }
+          amd::Comgr::destroy_metadata(workgroupSizeHint);
+        }
+      }
+      break;
+    case KernelField::VecTypeHint:
+      kernelMD->mAttrs.mVecTypeHint = buf;
+      break;
+    case KernelField::DeviceEnqueueSymbol:
+      kernelMD->mAttrs.mRuntimeHandle = buf;
+      break;
+    case KernelField::KernargSegmentSize:
+      kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
+      break;
+    case KernelField::GroupSegmentFixedSize:
+      kernelMD->mCodeProps.mGroupSegmentFixedSize = atoi(buf.c_str());
+      break;
+    case KernelField::PrivateSegmentFixedSize:
+      kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
+      break;
+    case KernelField::KernargSegmentAlign:
+      kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
+      break;
+    case KernelField::WavefrontSize:
+      kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
+      break;
+    case KernelField::NumSGPRs:
+      kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
+      break;
+    case KernelField::NumVGPRs:
+      kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
+      break;
+    case KernelField::MaxFlatWorkGroupSize:
+      kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
+      break;
+    case KernelField::NumSpilledSGPRs:
+      kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
+      break;
+    case KernelField::NumSpilledVGPRs:
+      kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
+      break;
+    case KernelField::SymbolName:
+      kernelMD->mSymbolName = buf;
+      break;
+    default:
+      return AMD_COMGR_STATUS_ERROR;
+  }
+
+  return status;
+}
+#endif
+
 // ================================================================================================
-Kernel::Kernel(const amd::Device& dev, const std::string& name)
+Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& prog)
  : dev_(dev)
  , name_(name)
+  , prog_(prog)
  , signature_(nullptr)
  , waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
  // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
@@ -482,6 +959,8 @@ static inline clk_value_type_t GetOclTypeOCL(const aclArgData* argInfo, size_t s

 // ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+static inline size_t GetArgOffsetOCL(const KernelArgMD& lcArg) { return lcArg.mOffset; }
+
 static inline size_t GetArgAlignmentOCL(const KernelArgMD& lcArg) { return lcArg.mAlign; }
 #endif

@@ -771,11 +1250,10 @@ static inline cl_kernel_arg_type_qualifier GetOclTypeQualOCL(const aclArgData* a
 // ================================================================================================
 #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
 #if defined(USE_COMGR_LIBRARY)
-bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode,
-                                     const uint32_t kernargSegmentByteSize,
-                                     KernelMD* kernelMD) {
+bool Kernel::GetAttrCodePropMetadata( const amd_comgr_metadata_node_t kernelMetaNode,
+                                      KernelMD* kernelMD) {

-  InitParameters(kernelMetaNode, kernargSegmentByteSize);
+  InitParameters(kernelMetaNode);

  // Set the workgroup information for the kernel
  workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
@@ -784,25 +1262,49 @@ bool Kernel::GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaN
  workGroupInfo_.availableVGPRs_ = 256;

  // extract the attribute metadata if there is any
-  amd_comgr_metadata_node_t attrMeta;
  amd_comgr_status_t status = AMD_COMGR_STATUS_SUCCESS;
-  if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) == AMD_COMGR_STATUS_SUCCESS) {
-    status = amd::Comgr::iterate_map_metadata(attrMeta, device::populateAttrs,
-                                            static_cast<void*>(kernelMD));
-    amd::Comgr::destroy_metadata(attrMeta);
+
+  switch (codeObjectVer()) {
+    case 2: {
+        amd_comgr_metadata_node_t symbolName;
+        status = amd::Comgr::metadata_lookup(kernelMetaNode, "SymbolName", &symbolName);
+        if (status == AMD_COMGR_STATUS_SUCCESS) {
+          status = getMetaBuf(symbolName, &(kernelMD->mSymbolName));
+          amd::Comgr::destroy_metadata(symbolName);
+        }
+
+        amd_comgr_metadata_node_t attrMeta;
+        if (status == AMD_COMGR_STATUS_SUCCESS) {
+          if (amd::Comgr::metadata_lookup(kernelMetaNode, "Attrs", &attrMeta) ==
+              AMD_COMGR_STATUS_SUCCESS) {
+            status = amd::Comgr::iterate_map_metadata(attrMeta, populateAttrs,
+                                                      static_cast<void*>(kernelMD));
+            amd::Comgr::destroy_metadata(attrMeta);
+          }
+        }
+
+        // extract the code properties metadata
+        amd_comgr_metadata_node_t codePropsMeta;
+        if (status == AMD_COMGR_STATUS_SUCCESS) {
+          status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta);
+        }
+
+        if (status == AMD_COMGR_STATUS_SUCCESS) {
+          status = amd::Comgr::iterate_map_metadata(codePropsMeta, populateCodeProps,
+                                                    static_cast<void*>(kernelMD));
+          amd::Comgr::destroy_metadata(codePropsMeta);
+        }
+      }
+      break;
+    case 3: {
+        status = amd::Comgr::iterate_map_metadata(kernelMetaNode, populateKernelMetaV3,
+                                                  static_cast<void*>(kernelMD));
+      }
+      break;
+    default:
+      return false;
  }

-  // extract the code properties metadata
-  amd_comgr_metadata_node_t codePropsMeta;
-  if (status == AMD_COMGR_STATUS_SUCCESS) {
-    status = amd::Comgr::metadata_lookup(kernelMetaNode, "CodeProps", &codePropsMeta);
-  }
-
-  if (status == AMD_COMGR_STATUS_SUCCESS) {
-    status = amd::Comgr::iterate_map_metadata(codePropsMeta, device::populateCodeProps,
-                                            static_cast<void*>(kernelMD));
-    amd::Comgr::destroy_metadata(codePropsMeta);
-  }

  if (status != AMD_COMGR_STATUS_SUCCESS) {
    return false;
@@ -917,19 +1419,21 @@ bool Kernel::GetPrintfStr(const amd_comgr_metadata_node_t programMD,
  return (status == AMD_COMGR_STATUS_SUCCESS);
 }

-void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize) {
+void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD) {
  // Iterate through the arguments and insert into parameterList
  device::Kernel::parameters_t params;
  device::Kernel::parameters_t hiddenParams;
  amd::KernelParameterDescriptor desc;
  size_t offset = 0;
-  size_t offsetStruct = argBufferSize;

  amd_comgr_metadata_node_t argsMeta;
  bool hsaArgsMeta = false;
  size_t argsSize;

-  amd_comgr_status_t status =  amd::Comgr::metadata_lookup(kernelMD, "Args", &argsMeta);
+  amd_comgr_status_t status =  amd::Comgr::metadata_lookup(
+                                          kernelMD,
+                                          (codeObjectVer() == 2) ? "Args" : ".args",
+                                          &argsMeta);
  if (status == AMD_COMGR_STATUS_SUCCESS) {
    hsaArgsMeta = true;
    status = amd::Comgr::get_metadata_list_size(argsMeta, &argsSize);
@@ -940,7 +1444,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
  }

  for (size_t i = 0; i < argsSize; ++i) {
-    KernelArgMD  lcArg;
+    KernelArgMD lcArg;

    amd_comgr_metadata_node_t argsNode;
    amd_comgr_metadata_kind_t kind;
@@ -956,7 +1460,13 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
      status = AMD_COMGR_STATUS_ERROR;
    }
    if (status == AMD_COMGR_STATUS_SUCCESS) {
-      status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, static_cast<void*>(&lcArg));
+      void *data = static_cast<void*>(&lcArg);
+      if (codeObjectVer() == 2) {
+        status = amd::Comgr::iterate_map_metadata(argsNode, populateArgs, data);
+      }
+      else if (codeObjectVer() == 3) {
+        status = amd::Comgr::iterate_map_metadata(argsNode, populateArgsV3, data);
+      }
    }

    if (hsaArgsNode) {
@@ -971,7 +1481,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
    }

    size_t size = GetArgSizeOCL(lcArg);
-    size_t alignment = GetArgAlignmentOCL(lcArg);
+    size_t alignment = (codeObjectVer() == 2) ? GetArgAlignmentOCL(lcArg) : 0;
    bool isHidden = false;
    desc.info_.oclObject_ = GetOclArgumentTypeOCL(lcArg, &isHidden);

@@ -980,7 +1490,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::HiddenCompletionAction) {
        setDynamicParallelFlag(true);
      }
-      offset = amd::alignUp(offset, alignment);
+      offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg);
      desc.offset_ = offset;
      desc.size_ = size;
      offset += size;
@@ -1007,7 +1517,7 @@ void Kernel::InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t a
      offset += sizeof(uint64_t);
    }
    else {
-      offset = amd::alignUp(offset, alignment);
+      offset = (codeObjectVer() == 2) ? amd::alignUp(offset, alignment) : GetArgOffsetOCL(lcArg);
      desc.offset_ = offset;
      offset += size;
    }
@@ -34,6 +34,7 @@ using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
 using llvm::AMDGPU::HSAMD::ValueKind;
 using llvm::AMDGPU::HSAMD::ValueType;

+//  for Code Object V3
 enum class ArgField : uint8_t {
  Name          = 0,
  TypeName      = 1,
@@ -48,7 +49,8 @@ enum class ArgField : uint8_t {
  IsConst       = 10,
  IsRestrict    = 11,
  IsVolatile    = 12,
-  IsPipe        = 13
+  IsPipe        = 13,
+  Offset        = 14
 };

 enum class AttrField : uint8_t {
@@ -167,6 +169,116 @@ static const std::map<std::string,CodePropField> CodePropFieldMap =
  {"NumSpilledSGPRs",         CodePropField::NumSpilledSGPRs},
  {"NumSpilledVGPRs",         CodePropField::NumSpilledVGPRs}
 };
+
+//  for Code Object V3
+enum class KernelField : uint8_t {
+  SymbolName              = 0,
+  ReqdWorkGroupSize       = 1,
+  WorkGroupSizeHint       = 2,
+  VecTypeHint             = 3,
+  DeviceEnqueueSymbol     = 4,
+  KernargSegmentSize      = 5,
+  GroupSegmentFixedSize   = 6,
+  PrivateSegmentFixedSize = 7,
+  KernargSegmentAlign     = 8,
+  WavefrontSize           = 9,
+  NumSGPRs                = 10,
+  NumVGPRs                = 11,
+  MaxFlatWorkGroupSize    = 12,
+  NumSpilledSGPRs         = 13,
+  NumSpilledVGPRs         = 14
+};
+
+static const std::map<std::string,ArgField> ArgFieldMapV3 =
+{
+  {".name",           ArgField::Name},
+  {".type_name",      ArgField::TypeName},
+  {".size",           ArgField::Size},
+  {".offset",         ArgField::Offset},
+  {".value_kind",     ArgField::ValueKind},
+  {".vaule_type",     ArgField::ValueType},
+  {".pointee_align",  ArgField::PointeeAlign},
+  {".address_space",  ArgField::AddrSpaceQual},
+  {".access",         ArgField::AccQual},
+  {".actual_access",  ArgField::ActualAccQual},
+  {".is_const",       ArgField::IsConst},
+  {".is_restrict",    ArgField::IsRestrict},
+  {".is_volatile",    ArgField::IsVolatile},
+  {".is_pipe",        ArgField::IsPipe}
+};
+
+static const std::map<std::string,ValueKind> ArgValueKindV3 =
+{
+  {"by_value",                  ValueKind::ByValue},
+  {"global_buffer",             ValueKind::GlobalBuffer},
+  {"dynamic_shared_pointer",    ValueKind::DynamicSharedPointer},
+  {"sampler",                   ValueKind::Sampler},
+  {"image",                     ValueKind::Image},
+  {"pipe",                      ValueKind::Pipe},
+  {"queue",                     ValueKind::Queue},
+  {"hidden_global_offset_x",    ValueKind::HiddenGlobalOffsetX},
+  {"hidden_global_offset_y",    ValueKind::HiddenGlobalOffsetY},
+  {"hidden_global_offset_z",    ValueKind::HiddenGlobalOffsetZ},
+  {"hidden_none",               ValueKind::HiddenNone},
+  {"hidden_printf_buffer",      ValueKind::HiddenPrintfBuffer},
+  {"hidden_default_queue",      ValueKind::HiddenDefaultQueue},
+  {"hidden_completion_action",  ValueKind::HiddenCompletionAction}
+};
+
+static const std::map<std::string,ValueType> ArgValueTypeV3 =
+{
+  {"struct",  ValueType::Struct},
+  {"i8",      ValueType::I8},
+  {"u8",      ValueType::U8},
+  {"i16",     ValueType::I16},
+  {"u16",     ValueType::U16},
+  {"f16",     ValueType::F16},
+  {"i32",     ValueType::I32},
+  {"u32",     ValueType::U32},
+  {"f32",     ValueType::F32},
+  {"i64",     ValueType::I64},
+  {"u64",     ValueType::U64},
+  {"f64",     ValueType::F64}
+};
+
+static const std::map<std::string,AccessQualifier> ArgAccQualV3 =
+{
+  {"default",    AccessQualifier::Default},
+  {"read_only",  AccessQualifier::ReadOnly},
+  {"write_only", AccessQualifier::WriteOnly},
+  {"read_write", AccessQualifier::ReadWrite}
+};
+
+static const std::map<std::string,AddressSpaceQualifier> ArgAddrSpaceQualV3 =
+{
+  {"private",   AddressSpaceQualifier::Private},
+  {"global",    AddressSpaceQualifier::Global},
+  {"constant",  AddressSpaceQualifier::Constant},
+  {"local",     AddressSpaceQualifier::Local},
+  {"generic",   AddressSpaceQualifier::Generic},
+  {"region",    AddressSpaceQualifier::Region}
+};
+
+static const std::map<std::string,KernelField> KernelFieldMapV3 =
+{
+  {".symbol",                     KernelField::SymbolName},
+  {".reqd_workgroup_size",        KernelField::ReqdWorkGroupSize},
+  {".workgorup_size_hint",        KernelField::WorkGroupSizeHint},
+  {".vec_type_hint",              KernelField::VecTypeHint},
+  {".device_enqueue_symbol",      KernelField::DeviceEnqueueSymbol},
+  {".kernarg_segment_size",       KernelField::KernargSegmentSize},
+  {".group_segment_fixed_size",   KernelField::GroupSegmentFixedSize},
+  {".private_segment_fixed_size", KernelField::PrivateSegmentFixedSize},
+  {".kernarg_segment_align",      KernelField::KernargSegmentAlign},
+  {".wavefront_size",             KernelField::WavefrontSize},
+  {".sgpr_count",                 KernelField::NumSGPRs},
+  {".vgpr_count",                 KernelField::NumVGPRs},
+  {".max_flat_workgroup_size",    KernelField::MaxFlatWorkGroupSize},
+  {".sgpr_spill_count",           KernelField::NumSpilledSGPRs},
+  {".vgpr_spill_count",           KernelField::NumSpilledVGPRs}
+};
+
+
 #endif  // defined(USE_COMGR_LIBRARY)
 #endif  // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)

@@ -234,6 +346,8 @@ struct KernelParameterDescriptor {

 namespace device {

+class Program;
+
 //! Printf info structure
 struct PrintfInfo {
  std::string fmtString_;        //!< formated string for printf
@@ -272,7 +386,7 @@ class Kernel : public amd::HeapObject {
  };

  //! Default constructor
-  Kernel(const amd::Device& dev, const std::string& name);
+  Kernel(const amd::Device& dev, const std::string& name, const Program& prog);

  //! Default destructor
  virtual ~Kernel();
@@ -372,7 +486,7 @@ class Kernel : public amd::HeapObject {
  //! Initializes the abstraction layer kernel parameters
 #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
 #if defined(USE_COMGR_LIBRARY)
-  void InitParameters(const amd_comgr_metadata_node_t kernelMD, uint32_t argBufferSize);
+  void InitParameters(const amd_comgr_metadata_node_t kernelMD);

  //! Get ther kernel metadata
  bool GetKernelMetadata(const amd_comgr_metadata_node_t programMD,
@@ -381,7 +495,6 @@ class Kernel : public amd::HeapObject {

  //! Retrieve kernel attribute and code properties metadata
  bool GetAttrCodePropMetadata(const amd_comgr_metadata_node_t kernelMetaNode,
-                               const uint32_t kernargSegmentByteSize,
                               KernelMD* kernelMD);

  //! Retrieve the available SGPRs and VGPRs
@@ -390,6 +503,12 @@ class Kernel : public amd::HeapObject {
  //! Retrieve the printf string metadata
  bool GetPrintfStr(const amd_comgr_metadata_node_t programMD,
                    std::vector<std::string>* printfStr);
+
+  //! Returns the kernel symbol name
+  const std::string& symbolName() const { return symbolName_; }
+
+  //! Returns the kernel code object version
+  const uint32_t codeObjectVer() const { return prog().codeObjectVer(); }
 #else
  void InitParameters(const KernelMD& kernelMD, uint32_t argBufferSize);
 #endif
@@ -404,8 +523,13 @@ class Kernel : public amd::HeapObject {
  //! Initializes HSAIL Printf metadata and info
  void InitPrintf(const aclPrintfFmt* aclPrintf);
 #endif
+  //! Returns program associated with this kernel
+  const Program& prog() const { return prog_; }
+
  const amd::Device& dev_;          //!< GPU device object
  std::string name_;                //!< kernel name
+  const Program& prog_;             //!< Reference to the parent program
+  std::string symbolName_;          //!< kernel symbol name
  WorkGroupInfo workGroupInfo_;     //!< device kernel info structure
  amd::KernelSignature* signature_; //!< kernel signature
  std::string buildLog_;            //!< build log
@@ -424,6 +548,7 @@ class Kernel : public amd::HeapObject {
    Flags() : value_(0) {}
  } flags_;

+
 private:
  //! Disable default copy constructor
  Kernel(const Kernel&);
@@ -447,264 +572,5 @@ static amd_comgr_status_t getMetaBuf(const amd_comgr_metadata_node_t meta,

  return status;
 }
-
-static amd_comgr_status_t populateArgs(const amd_comgr_metadata_node_t key,
-                                       const amd_comgr_metadata_node_t value,
-                                       void *data) {
-  amd_comgr_status_t status;
-  amd_comgr_metadata_kind_t kind;
-  std::string buf;
-
-  // get the key of the argument field
-  size_t size = 0;
-  status = amd::Comgr::get_metadata_kind(key, &kind);
-  if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
-    status = getMetaBuf(key, &buf);
-  }
-
-  if (status != AMD_COMGR_STATUS_SUCCESS) {
-    return AMD_COMGR_STATUS_ERROR;
-  }
-
-  auto itArgField = ArgFieldMap.find(buf);
-  if (itArgField == ArgFieldMap.end()) {
-    return AMD_COMGR_STATUS_ERROR;
-  }
-
-  // get the value of the argument field
-  status = getMetaBuf(value, &buf);
-
-  KernelArgMD* lcArg = static_cast<KernelArgMD*>(data);
-
-  switch (itArgField->second) {
-    case ArgField::Name:
-      lcArg->mName = buf;
-      break;
-    case ArgField::TypeName:
-      lcArg->mTypeName = buf;
-      break;
-    case ArgField::Size:
-      lcArg->mSize = atoi(buf.c_str());
-      break;
-    case ArgField::Align:
-      lcArg->mAlign = atoi(buf.c_str());
-      break;
-    case ArgField::ValueKind:
-      {
-        auto itValueKind = ArgValueKind.find(buf);
-        if (itValueKind == ArgValueKind.end()) {
-          return AMD_COMGR_STATUS_ERROR;
-        }
-        lcArg->mValueKind = itValueKind->second;
-      }
-      break;
-    case ArgField::ValueType:
-      {
-        auto itValueType = ArgValueType.find(buf);
-        if (itValueType == ArgValueType.end()) {
-          return AMD_COMGR_STATUS_ERROR;
-       }
-       lcArg->mValueType = itValueType->second;
-      }
-      break;
-    case ArgField::PointeeAlign:
-      lcArg->mPointeeAlign = atoi(buf.c_str());
-      break;
-    case ArgField::AddrSpaceQual:
-      {
-        auto itAddrSpaceQual = ArgAddrSpaceQual.find(buf);
-        if (itAddrSpaceQual == ArgAddrSpaceQual.end()) {
-          return AMD_COMGR_STATUS_ERROR;
-        }
-        lcArg->mAddrSpaceQual = itAddrSpaceQual->second;
-      }
-      break;
-    case ArgField::AccQual:
-      {
-        auto itAccQual = ArgAccQual.find(buf);
-        if (itAccQual == ArgAccQual.end()) {
-          return AMD_COMGR_STATUS_ERROR;
-        }
-        lcArg->mAccQual = itAccQual->second;
-      }
-      break;
-    case ArgField::ActualAccQual:
-      {
-        auto itAccQual = ArgAccQual.find(buf);
-        if (itAccQual == ArgAccQual.end()) {
-            return AMD_COMGR_STATUS_ERROR;
-        }
-        lcArg->mActualAccQual = itAccQual->second;
-      }
-      break;
-    case ArgField::IsConst:
-      lcArg->mIsConst = (buf.compare("true") == 0);
-      break;
-    case ArgField::IsRestrict:
-      lcArg->mIsRestrict = (buf.compare("true") == 0);
-      break;
-    case ArgField::IsVolatile:
-      lcArg->mIsVolatile = (buf.compare("true") == 0);
-      break;
-    case ArgField::IsPipe:
-      lcArg->mIsPipe = (buf.compare("true") == 0);
-      break;
-    default:
-      return AMD_COMGR_STATUS_ERROR;
-  }
-  return AMD_COMGR_STATUS_SUCCESS;
-}
-
-static amd_comgr_status_t populateAttrs(const amd_comgr_metadata_node_t key,
-                                        const amd_comgr_metadata_node_t value,
-                                        void *data) {
-  amd_comgr_status_t status;
-  amd_comgr_metadata_kind_t kind;
-  size_t size = 0;
-  std::string buf;
-
-  // get the key of the argument field
-  status = amd::Comgr::get_metadata_kind(key, &kind);
-  if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
-    status = getMetaBuf(key, &buf);
-  }
-
-  if (status != AMD_COMGR_STATUS_SUCCESS) {
-    return AMD_COMGR_STATUS_ERROR;
-  }
-
-  auto itAttrField = AttrFieldMap.find(buf);
-  if (itAttrField == AttrFieldMap.end()) {
-    return AMD_COMGR_STATUS_ERROR;
-  }
-
-  KernelMD* kernelMD = static_cast<KernelMD*>(data);
-  switch (itAttrField->second) {
-    case AttrField::ReqdWorkGroupSize:
-      {
-        status = amd::Comgr::get_metadata_list_size(value, &size);
-        if (size == 3 && status == AMD_COMGR_STATUS_SUCCESS) {
-          for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
-            amd_comgr_metadata_node_t workgroupSize;
-            status = amd::Comgr::index_list_metadata(value, i, &workgroupSize);
-
-            if (status == AMD_COMGR_STATUS_SUCCESS &&
-                getMetaBuf(workgroupSize, &buf) == AMD_COMGR_STATUS_SUCCESS) {
-              kernelMD->mAttrs.mReqdWorkGroupSize.push_back(atoi(buf.c_str()));
-            }
-            amd::Comgr::destroy_metadata(workgroupSize);
-          }
-        }
-      }
-      break;
-    case AttrField::WorkGroupSizeHint:
-      {
-        status = amd::Comgr::get_metadata_list_size(value, &size);
-        if (status == AMD_COMGR_STATUS_SUCCESS && size == 3) {
-          for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
-            amd_comgr_metadata_node_t workgroupSizeHint;
-            status = amd::Comgr::index_list_metadata(value, i, &workgroupSizeHint);
-
-            if (status == AMD_COMGR_STATUS_SUCCESS &&
-                getMetaBuf(workgroupSizeHint, &buf) == AMD_COMGR_STATUS_SUCCESS) {
-              kernelMD->mAttrs.mWorkGroupSizeHint.push_back(atoi(buf.c_str()));
-            }
-            amd::Comgr::destroy_metadata(workgroupSizeHint);
-          }
-        }
-      }
-      break;
-    case AttrField::VecTypeHint:
-      {
-        if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
-          kernelMD->mAttrs.mVecTypeHint = buf;
-        }
-      }
-      break;
-    case AttrField::RuntimeHandle:
-      {
-        if (getMetaBuf(value,&buf) == AMD_COMGR_STATUS_SUCCESS) {
-          kernelMD->mAttrs.mRuntimeHandle = buf;
-        }
-      }
-      break;
-    default:
-      return AMD_COMGR_STATUS_ERROR;
-  }
-
-  return status;
-}
-
-static amd_comgr_status_t populateCodeProps(const amd_comgr_metadata_node_t key,
-                                            const amd_comgr_metadata_node_t value,
-                                            void *data) {
-  amd_comgr_status_t status;
-  amd_comgr_metadata_kind_t kind;
-  std::string buf;
-
-  // get the key of the argument field
-  status = amd::Comgr::get_metadata_kind(key, &kind);
-  if (kind == AMD_COMGR_METADATA_KIND_STRING && status == AMD_COMGR_STATUS_SUCCESS) {
-    status = getMetaBuf(key, &buf);
-  }
-
-  if (status != AMD_COMGR_STATUS_SUCCESS) {
-    return AMD_COMGR_STATUS_ERROR;
-  }
-
-  auto itCodePropField = CodePropFieldMap.find(buf);
-  if (itCodePropField == CodePropFieldMap.end()) {
-    return AMD_COMGR_STATUS_ERROR;
-  }
-
-  // get the value of the argument field
-  if (status == AMD_COMGR_STATUS_SUCCESS) {
-    status = getMetaBuf(value, &buf);
-  }
-
-  KernelMD*  kernelMD = static_cast<KernelMD*>(data);
-  switch (itCodePropField->second) {
-    case CodePropField::KernargSegmentSize:
-      kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
-      break;
-    case CodePropField::GroupSegmentFixedSize:
-      kernelMD->mCodeProps.mKernargSegmentSize = atoi(buf.c_str());
-      break;
-    case CodePropField::PrivateSegmentFixedSize:
-      kernelMD->mCodeProps.mPrivateSegmentFixedSize = atoi(buf.c_str());
-      break;
-    case CodePropField::KernargSegmentAlign:
-      kernelMD->mCodeProps.mKernargSegmentAlign = atoi(buf.c_str());
-      break;
-    case CodePropField::WavefrontSize:
-      kernelMD->mCodeProps.mWavefrontSize = atoi(buf.c_str());
-      break;
-    case CodePropField::NumSGPRs:
-      kernelMD->mCodeProps.mNumSGPRs = atoi(buf.c_str());
-      break;
-    case CodePropField::NumVGPRs:
-      kernelMD->mCodeProps.mNumVGPRs = atoi(buf.c_str());
-      break;
-    case CodePropField::MaxFlatWorkGroupSize:
-      kernelMD->mCodeProps.mMaxFlatWorkGroupSize = atoi(buf.c_str());
-      break;
-    case CodePropField::IsDynamicCallStack:
-        kernelMD->mCodeProps.mIsDynamicCallStack = (buf.compare("true") == 0);
-      break;
-    case CodePropField::IsXNACKEnabled:
-      kernelMD->mCodeProps.mIsXNACKEnabled = (buf.compare("true") == 0);
-      break;
-    case CodePropField::NumSpilledSGPRs:
-      kernelMD->mCodeProps.mNumSpilledSGPRs = atoi(buf.c_str());
-      break;
-    case CodePropField::NumSpilledVGPRs:
-      kernelMD->mCodeProps.mNumSpilledVGPRs = atoi(buf.c_str());
-      break;
-    default:
-      return AMD_COMGR_STATUS_ERROR;
-  }
-  return AMD_COMGR_STATUS_SUCCESS;
-}
-#endif
-
+#endif // defined(USE_COMGR_LIBRARY)
 } // namespace device
@@ -706,8 +706,6 @@ bool Program::compileImplLC(const std::string& sourceCode,
  driverOptions.append(options->llvmOptions);
  driverOptions.append(ProcessOptions(options));

-  // Force object code v2.
-  driverOptions.append(" -mno-code-object-v3");
  // Set whole program mode
  driverOptions.append(" -mllvm -amdgpu-early-inline-all -mllvm -amdgpu-prelink");

@@ -1532,8 +1530,6 @@ bool Program::linkImplLC(amd::option::Options* options) {
    codegenOptions.append(" -mno-sram-ecc");
  }

-  // Force object code v2.
-  codegenOptions.append(" -mno-code-object-v3");
  // Set whole program mode
  codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");

@@ -1764,8 +1760,6 @@ bool Program::linkImplLC(amd::option::Options* options) {
    std::ostream_iterator<std::string>(ostrstr, " "));
  codegenOptions.append(" ").append(ostrstr.str());

-  // Force object code v2.
-  codegenOptions.append(" -mno-code-object-v3");
  // Set whole program mode
  codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all");

@@ -2880,8 +2874,19 @@ bool Program::createKernelMetadataMap() {

  status = amd::Comgr::metadata_lookup(*metadata_, "Kernels", &kernelsMD);
  if (status == AMD_COMGR_STATUS_SUCCESS) {
+    LogInfo("Using Code Object V2.");
    hasKernelMD = true;
-    status = amd::Comgr::get_metadata_list_size(kernelsMD, &size);
+    codeObjectVer_ = 2;
+  }
+  else {
+    status = amd::Comgr::metadata_lookup(*metadata_, "amdhsa.kernels", &kernelsMD);
+
+    if (status == AMD_COMGR_STATUS_SUCCESS) {
+      LogInfo("Using Code Object V3.");
+      hasKernelMD = true;
+      codeObjectVer_ = 3;
+      status = amd::Comgr::get_metadata_list_size(kernelsMD, &size);
+    }
  }

  for (size_t i = 0; i < size && status == AMD_COMGR_STATUS_SUCCESS; i++) {
@@ -2896,7 +2901,9 @@ bool Program::createKernelMetadataMap() {

    if (status == AMD_COMGR_STATUS_SUCCESS) {
      hasKernelNode = true;
-      status = amd::Comgr::metadata_lookup(kernelNode, "Name", &nameMeta);
+      status = amd::Comgr::metadata_lookup(kernelNode,
+                                           (codeObjectVer() == 2) ? "Name" : ".name",
+                                           &nameMeta);
    }

    if (status == AMD_COMGR_STATUS_SUCCESS) {
@@ -2970,9 +2977,10 @@ bool Program::FindGlobalVarSize(void* binary, size_t binSize) {
          buildLog_ += "Error: object code with old metadata is not supported\n";
          return false;
        }
-        else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ &&
-          note->n_namesz == sizeof "AMD" &&
-          !memcmp(name, "AMD", note->n_namesz)) {
+        else if ((note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA V2 */ &&
+                  note->n_namesz == sizeof "AMD" && !memcmp(name, "AMD", note->n_namesz)) ||
+                (note->n_type == 32 /* NT_AMD_AMDGPU_HSA_METADATA V3 */ &&
+                  note->n_namesz == sizeof "AMDGPU" && !memcmp(name, "AMDGPU", note->n_namesz))) {
 #if defined(USE_COMGR_LIBRARY)
          amd_comgr_status_t status;
          amd_comgr_data_t binaryData;
@@ -118,6 +118,7 @@ class Program : public amd::HeapObject {

 #if defined(USE_COMGR_LIBRARY)
  amd_comgr_metadata_node_t* metadata_;   //!< COMgr metadata
+  uint32_t codeObjectVer_;                //!< version of code object
  std::map<std::string,amd_comgr_metadata_node_t> kernelMetadataMap_; //!< Map of kernel metadata
 #else
  CodeObjectMD* metadata_;  //!< Runtime metadata
@@ -211,6 +212,8 @@ class Program : public amd::HeapObject {
    auto it = kernelMetadataMap_.find(name);
    return (it == kernelMetadataMap_.end()) ? nullptr : &(it->second);
  }
+
+  const uint32_t codeObjectVer() const { return codeObjectVer_; }
 #else
  const CodeObjectMD* metadata() const { return metadata_; }
 #endif
@@ -515,10 +515,9 @@ clk_value_type_t KernelArg::type() const {

 NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
                       const NullProgram& nullprog)
-    : device::Kernel(gpuNullDev, name),
+    : device::Kernel(gpuNullDev, name, nullprog),
      buildError_(CL_BUILD_PROGRAM_FAILURE),
      gpuDev_(gpuNullDev),
-      prog_(nullprog),
      calRef_(NULL),
      internal_(false),
      flags_(0),
@@ -3027,9 +3026,8 @@ void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {

 HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
                         uint extraArgsNum)
-    : device::Kernel(prog->dev(), name),
+    : device::Kernel(prog->dev(), name, *prog),
      compileOptions_(compileOptions),
-      prog_(*prog),
      index_(0),
      code_(NULL),
      codeSize_(0),
@@ -406,7 +406,7 @@ class NullKernel : public device::Kernel {
  const NullDevice& nullDev() const { return gpuDev_; }

  //! Returns GPU device object, associated with this kernel
-  const NullProgram& nullProg() const { return prog_; }
+  const NullProgram& nullProg() const { return reinterpret_cast<const NullProgram&>(prog_); }

  //! Returns the kernel's build error
  const cl_int buildError() const { return buildError_; }
@@ -455,7 +455,6 @@ class NullKernel : public device::Kernel {
  std::string ilSource_;  //!< IL source code of this kernel

  const NullDevice& gpuDev_;  //!< GPU device object
-  const NullProgram& prog_;   //!< Reference to the parent program

  CalImageReference* calRef_;  //!< CAL image reference for this kernel
  bool internal_;              //!< Runtime internal ker
@@ -847,7 +846,6 @@ class HSAILKernel : public device::Kernel {
  std::vector<Argument*> arguments_;  //!< Vector list of HSAIL Arguments
  std::string compileOptions_;        //!< compile used for finalizing this kernel
  amd_kernel_code_t* cpuAqlCode_;     //!< AQL kernel code on CPU
-  const HSAILProgram& prog_;          //!< Reference to the parent program
  uint index_;                        //!< Kernel index in the program

  gpu::Memory* code_;  //!< Memory object with ISA code
@@ -26,37 +26,17 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;

 namespace pal {

-bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
-  if (!sym) {
-    return false;
-  }
-  if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
-    return false;
-  }
-
-  if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
-    reinterpret_cast<void*>(&codeSize_))) {
-    return false;
-  }
-
-  amd_kernel_code_t* akc = &akc_;
-  // Copy codeobject of this kernel from the program CPU segment
-  memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
-
-  size_t akc_align = 0;
-  if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN,
-                    reinterpret_cast<void*>(&akc_align))) {
-    return false;
-  }
-
+void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
+                                   const uint32_t groupSegmentSize,
+                                   const uint16_t numSGPRs,
+                                   const uint16_t numVGPRs) {
  workGroupInfo_.scratchRegs_ =
-      amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
-  workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
-  workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ =
-      akc->workgroup_group_segment_byte_size;
-  workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
+      amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
+  workGroupInfo_.privateMemSize_ = privateSegmentSize;
+  workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
+  workGroupInfo_.usedSGPRs_ = numSGPRs;
  workGroupInfo_.usedStackSize_ = 0;
-  workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
+  workGroupInfo_.usedVGPRs_ = numVGPRs;

  if (!prog().isNull()) {
    workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
@@ -72,16 +52,57 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
    workGroupInfo_.availableVGPRs_ = 256;
    workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
  }
+}
+
+bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc) {
+  if (!sym) {
+    return false;
+  }
+  if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
+    return false;
+  }
+
+  // Copy code object of this kernel from the program CPU segment
+  memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
+
+  return true;
+}
+
+bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
+
+  amd_kernel_code_t* akc = &akc_;
+
+  if (!setKernelCode(sym, akc)) {
+    return false;
+  }
+
+  if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
+    reinterpret_cast<void*>(&codeSize_))) {
+    return false;
+  }
+
+    // Setup the the workgroup info
+  setWorkGroupInfo(akc->workitem_private_segment_byte_size,
+                   akc->workgroup_group_segment_byte_size,
+                   akc->wavefront_sgpr_count,
+                   akc->workitem_vgpr_count);
+
+  workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
+  kernargSegmentByteSize_ =  akc->kernarg_segment_byte_size;
+  spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t));
+
  return true;
 }

 HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
-    : device::Kernel(prog->dev(), name),
+    : device::Kernel(prog->dev(), name, *prog),
      compileOptions_(compileOptions),
-      prog_(*prog),
      index_(0),
      code_(0),
-      codeSize_(0)
+      codeSize_(0),
+      workgroupGroupSegmentByteSize_(0),
+      kernargSegmentByteSize_(0),
+      spillSegmentByteSize_(0)
 {
  flags_.hsa_ = true;
 }
@@ -388,37 +409,54 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
 }
 #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)

-bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
-#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#if defined(USE_COMGR_LIBRARY)
+bool LightningKernel::init() {
  flags_.internalKernel_ =
      (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;

-  aqlCreateHWInfo(symbol);
-
-#if defined(USE_COMGR_LIBRARY)
  const amd_comgr_metadata_node_t* kernelMetaNode = prog().getKernelMetadata(name());
  if (kernelMetaNode == nullptr) {
    return false;
  }

  KernelMD  kernelMD;
-  if (!GetAttrCodePropMetadata(*kernelMetaNode, argsBufferSize(), &kernelMD)) {
+  if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
    return false;
  }

+  symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
+
+  workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
+  spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
+                                       sizeof(uint32_t));
+  kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
+
+  // Copy codeobject of this kernel from the program CPU segment
+  hsa_agent_t agent;
+  agent.handle = 1;
+
+  auto sym = prog().GetSymbol(symbolName().c_str(), const_cast<hsa_agent_t*>(&agent));
+
+  if (!setKernelCode(sym, &akc_)) {
+    return false;
+  }
+
+  codeSize_ = prog().codeSegGpu().owner()->getSize();
+
+  // handle device enqueue
  if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
    hsa_agent_t agent;
    agent.handle = 1;
    amd::hsa::loader::Symbol* rth_symbol;

    // Get the runtime handle symbol GPU address
-    rth_symbol = prog_.GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
+    rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
                                const_cast<hsa_agent_t*>(&agent));
    uint64_t symbol_address;
    rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);

    // Copy the kernel_object pointer to the runtime handle symbol GPU address
-    const Memory& codeSegGpu = prog_.codeSegGpu();
+    const Memory& codeSegGpu = prog().codeSegGpu();
    uint64_t offset = symbol_address - codeSegGpu.vmAddress();
    uint64_t kernel_object = gpuAqlCode();
    VirtualGPU* gpu = codeSegGpu.dev().xferQueue();
@@ -432,6 +470,12 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
    codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
  }

+  // Setup the the workgroup info
+  setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
+                   kernelMD.mCodeProps.mGroupSegmentFixedSize,
+                   kernelMD.mCodeProps.mNumSGPRs,
+                   kernelMD.mCodeProps.mNumVGPRs);
+
  // Copy wavefront size
  workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;

@@ -452,7 +496,18 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
  if (!printfStr.empty()) {
    InitPrintf(printfStr);
  }
-#else
+
+  return true;
+}
+#endif // defined(USE_COMGR_LIBRARY)
+
+bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
+#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+  flags_.internalKernel_ =
+      (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
+
+  aqlCreateHWInfo(symbol);
+
  const CodeObjectMD* programMD = prog().metadata();
  assert(programMD != nullptr);

@@ -489,13 +544,13 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
    amd::hsa::loader::Symbol* rth_symbol;

    // Get the runtime handle symbol GPU address
-    rth_symbol = prog_.GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
+    rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
                                const_cast<hsa_agent_t*>(&agent));
    uint64_t symbol_address;
    rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);

    // Copy the kernel_object pointer to the runtime handle symbol GPU address
-    const Memory& codeSegGpu = prog_.codeSegGpu();
+    const Memory& codeSegGpu = prog().codeSegGpu();
    uint64_t offset = symbol_address - codeSegGpu.vmAddress();
    VirtualGPU* gpu = codeSegGpu.dev().xferQueue();

@@ -529,8 +584,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {

  waveLimiter_.enable();
  */
-#endif // defined(USE_COMGR_LIBRARY)
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
  return true;
 }

@@ -59,7 +59,7 @@ class HSAILKernel : public device::Kernel {
  const HSAILProgram& prog() const;

  //! Returns LDS size used in this kernel
-  uint32_t ldsSize() const { return akc_.workgroup_group_segment_byte_size; }
+  uint32_t ldsSize() const { return workgroupGroupSegmentByteSize_; }

  //! Returns pointer on CPU to AQL code info
  const amd_kernel_code_t* cpuAqlCode() const { return &akc_; }
@@ -71,10 +71,10 @@ class HSAILKernel : public device::Kernel {
  size_t aqlCodeSize() const { return codeSize_; }

  //! Returns the size of argument buffer
-  size_t argsBufferSize() const { return akc_.kernarg_segment_byte_size; }
+  size_t argsBufferSize() const { return kernargSegmentByteSize_; }

  //! Returns spill reg size per workitem
-  uint32_t spillSegSize() const { return amd::alignUp(akc_.workitem_private_segment_byte_size, sizeof(uint32_t)); }
+  uint32_t spillSegSize() const { return spillSegmentByteSize_; }

  //! Returns AQL packet in CPU memory
  //! if the kernel arguments were successfully loaded, otherwise NULL
@@ -102,13 +102,25 @@ class HSAILKernel : public device::Kernel {
  //! Creates AQL kernel HW info
  bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym);

+  //! Get the kernel code and copy the code object from the program CPU segment
+  bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);
+
+  //! Set up the workgroup info based on the kernel metadata
+  void setWorkGroupInfo(const uint32_t privateSegmentSize,
+                        const uint32_t groupSegmentSize,
+                        const uint16_t numSGPRs,
+                        const uint16_t numVGPRs);
+
  std::string compileOptions_;    //!< compile used for finalizing this kernel
  amd_kernel_code_t akc_;         //!< AQL kernel code on CPU
-  const HSAILProgram& prog_;      //!< Reference to the parent program
  uint index_;                    //!< Kernel index in the program

  uint64_t code_;     //!< GPU memory pointer to the kernel
  size_t codeSize_;   //!< Size of ISA code
+
+  uint32_t workgroupGroupSegmentByteSize_;    //!< LDS size used in the kernel
+  uint32_t kernargSegmentByteSize_;           //!< Size of kernel argument buffer
+  uint32_t spillSegmentByteSize_;             //!< Spill reg size per workitem
 };

 class LightningKernel : public HSAILKernel {
@@ -121,6 +133,11 @@ class LightningKernel : public HSAILKernel {

  //! Initializes the metadata required for this kernel,
  bool init(amd::hsa::loader::Symbol* symbol);
+
+#if defined(USE_COMGR_LIBRARY)
+  //! Initializes the metadata required for this kernel,
+  bool init();
+#endif
 };

 /*@}*/} // namespace pal
@@ -747,6 +747,25 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
    return false;
  }

+#if defined(USE_COMGR_LIBRARY)
+  for (const auto &kernelMeta : kernelMetadataMap_) {
+    auto kernelName = kernelMeta.first;
+    auto kernel = new LightningKernel(kernelName, this,
+                                      options->origOptionStr + ProcessOptions(options));
+    kernels()[kernelName] = kernel;
+
+    if (!kernel->init()) {
+      return false;
+    }
+
+    kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+
+    // Find max scratch regs used in the program. It's used for scratch buffer preallocation
+    // with dynamic parallelism, since runtime doesn't know which child kernel will be called
+    maxScratchRegs_ =
+        std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
+  }
+#else
  // Get the list of kernels
  std::vector<std::string> kernelNameList;
  status = executable_->IterateSymbols(GetKernelNamesCallback, &kernelNameList);
@@ -781,7 +800,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
    maxScratchRegs_ =
        std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
  }
-
+#endif // defined(USE_COMGR_LIBRARY)
  DestroySegmentCpuAccess();
 #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
  return true;
@@ -166,7 +166,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
  }

  // Update GPU specific settings and info structure if we have any
+#if defined(_WIN32)
+  ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL};
+#else
  ModifyMaxWorkload modifyMaxWorkload = {0};
+#endif

  // APU systems
  if (palProp.gpuType == Pal::GpuType::Integrated) {
@@ -326,18 +330,22 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
      return false;
  }

-#if defined(_WIN32)
-  if (modifyMaxWorkload.time > 0) {
-    OSVERSIONINFOEX versionInfo = {0};
-    versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
-    versionInfo.dwMajorVersion = 6;
-    versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
+  splitSizeForWin7_ = false;

-    DWORDLONG conditionMask = 0;
-    VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
-    VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
-    if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
-      maxWorkloadTime_ = modifyMaxWorkload.time;
+#if defined(_WIN32)
+  OSVERSIONINFOEX versionInfo = {0};
+  versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+  versionInfo.dwMajorVersion = 6;
+  versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
+
+  DWORDLONG conditionMask = 0;
+  VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
+  VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
+
+  if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
+    splitSizeForWin7_ = true;  // Update flag of DMA flush split size for Win 7
+    if (modifyMaxWorkload.time > 0) {
+      maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
    }
  }
 #endif  // defined(_WIN32)
@@ -60,7 +60,8 @@ class Settings : public device::Settings {
      uint sdamPageFaultWar_ : 1;        //!< SDMA page fault workaround
      uint rgpSqttWaitIdle_: 1;          //!< Wait for idle after SQTT trace
      uint rgpSqttForceDisable_: 1;      //!< Disables SQTT
-      uint reserved_ : 12;
+      uint splitSizeForWin7_: 1;         //!< DMA flush split size for Win 7
+      uint reserved_ : 11;
    };
    uint value_;
  };
@@ -553,6 +553,11 @@ void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {

 void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
                                             uint instructions) {
+  if (!dev.settings().splitSizeForWin7_) {
+    dispatchSplitSize_ = 0;
+    return;
+  }
+
  uint64_t workload = threads * instructions;
  if (maxDispatchWorkload_ < workload) {
    dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
@@ -2258,6 +2263,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
      enqueueEvent->profilingInfo().waves_ : 0;
    dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
+    dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
+    dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
    // Run AQL dispatch in HW
    eventBegin(MainEngine);
    iCmd()->CmdDispatchAql(dispatchParam);
@@ -25,41 +25,58 @@ Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle
               const uint32_t workgroupGroupSegmentByteSize,
               const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
               const uint32_t kernargSegmentAlignment)
-    : device::Kernel(prog->dev(), name),
-      program_(prog),
+    : device::Kernel(prog->dev(), name, *prog),
      kernelCodeHandle_(kernelCodeHandle),
      workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize),
      workitemPrivateSegmentByteSize_(workitemPrivateSegmentByteSize),
      kernargSegmentByteSize_(kernargSegmentByteSize),
      kernargSegmentAlignment_(kernargSegmentAlignment) {}

+Kernel::Kernel(std::string name, Program* prog)
+    : device::Kernel(prog->dev(), name, *prog),
+      kernelCodeHandle_(0),
+      workgroupGroupSegmentByteSize_(0),
+      workitemPrivateSegmentByteSize_(0),
+      kernargSegmentByteSize_(0),
+      kernargSegmentAlignment_(0) {}
+
 #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
 #if defined(USE_COMGR_LIBRARY)
 bool LightningKernel::init() {

-  hsa_agent_t hsaDevice = program_->hsaDevice();
+  hsa_agent_t hsaDevice = program()->hsaDevice();

  const amd_comgr_metadata_node_t* kernelMetaNode =
-              static_cast<LightningProgram*>(program_)->getKernelMetadata(name());
+              static_cast<const LightningProgram*>(program())->getKernelMetadata(name());
  if (kernelMetaNode == nullptr) {
    return false;
  }

  KernelMD  kernelMD;
-  if (!GetAttrCodePropMetadata(*kernelMetaNode, KernargSegmentByteSize(), &kernelMD)) {
+  if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
    return false;
  }

+  // Set the kernel symbol name and size/alignment based on the kernel metadata
+  // NOTE: kernel name is used to get the kernel code handle in V2,
+  //       but kernel symbol name is used in V3
+  symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;
+  workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
+  workitemPrivateSegmentByteSize_ = kernelMD.mCodeProps.mPrivateSegmentFixedSize;
+  kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;
+  kernargSegmentAlignment_ = amd::alignUp(std::max(kernelMD.mCodeProps.mKernargSegmentAlign, 128u),
+                                          dev().info().globalMemCacheLineSize_);
+
  // Set the workgroup information for the kernel
  workGroupInfo_.availableLDSSize_ = dev().info().localMemSizePerCU_;
  assert(workGroupInfo_.availableLDSSize_ > 0);

  // Get the available SGPRs and VGPRs
-  std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program_->machineTarget();
-  if (program_->xnackEnable()) {
+  std::string targetIdent = std::string("amdgcn-amd-amdhsa--")+program()->machineTarget();
+  if (program()->xnackEnable()) {
    targetIdent.append("+xnack");
  }
-  if (program_->sramEccEnable()) {
+  if (program()->sramEccEnable()) {
    targetIdent.append("+sram-ecc");
  }

@@ -67,10 +84,23 @@ bool LightningKernel::init() {
    return false;
  }

+  // Get the kernel code handle
+  hsa_status_t hsaStatus;
+  hsa_executable_symbol_t symbol;
+  hsa_agent_t agent = program()->hsaDevice();
+  hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(),
+                                                symbolName().c_str(),
+                                                &agent, &symbol);
+  if (hsaStatus == HSA_STATUS_SUCCESS) {
+    hsaStatus = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+                                               &kernelCodeHandle_);
+  }
+  if (hsaStatus != HSA_STATUS_SUCCESS) {
+    return false;
+  }
+
  if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
-    hsa_agent_t             agent = program_->hsaDevice();
    hsa_executable_symbol_t kernelSymbol;
-    hsa_status_t            hsaStatus;
    int                     variable_size;
    uint64_t                variable_address;

@@ -79,7 +109,7 @@ bool LightningKernel::init() {
    // object handle of such a kernel. The address of the variable and the kernel code object handle are known
    // only after the hsa executable is loaded. The below code copies the kernel code object handle to the
    // address of the variable.
-    hsaStatus = hsa_executable_get_symbol_by_name(program_->hsaExecutable(),
+    hsaStatus = hsa_executable_get_symbol_by_name(program()->hsaExecutable(),
                                                  kernelMD.mAttrs.mRuntimeHandle.c_str(),
                                                  &agent, &kernelSymbol);
    if (hsaStatus == HSA_STATUS_SUCCESS) {
@@ -87,11 +117,6 @@ bool LightningKernel::init() {
                                                 HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
                                                 &variable_size);
    }
-    if (hsaStatus == HSA_STATUS_SUCCESS) {
-      hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
-                                                 HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
-                                                 &variable_size);
-    }
    if (hsaStatus == HSA_STATUS_SUCCESS) {
      hsaStatus = hsa_executable_symbol_get_info(kernelSymbol,
                                                 HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
@@ -114,7 +139,7 @@ bool LightningKernel::init() {
  }

  uint32_t wavefront_size = 0;
-  if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
+  if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
      HSA_STATUS_SUCCESS) {
    return false;
  }
@@ -127,7 +152,7 @@ bool LightningKernel::init() {
  workGroupInfo_.usedSGPRs_ = kernelMD.mCodeProps.mNumSGPRs;
  workGroupInfo_.usedVGPRs_ = kernelMD.mCodeProps.mNumVGPRs;
  workGroupInfo_.usedStackSize_ = 0;
-  workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+  workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
  workGroupInfo_.wavefrontSize_ = wavefront_size;
  workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize;
  if (workGroupInfo_.size_ == 0) {
@@ -135,7 +160,7 @@ bool LightningKernel::init() {
  }

  // handle the printf metadata if any
-  const amd_comgr_metadata_node_t* programMD = static_cast<LightningProgram*>(program_)->metadata();
+  const amd_comgr_metadata_node_t* programMD = static_cast<const LightningProgram*>(program())->metadata();
  assert(programMD != nullptr);

  std::vector<std::string> printfStr;
@@ -159,10 +184,10 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
 }

 bool LightningKernel::init() {
-  hsa_agent_t hsaDevice = program_->hsaDevice();
+  hsa_agent_t hsaDevice = program()->hsaDevice();

  // Pull out metadata from the ELF
-  const CodeObjectMD* programMD = static_cast<LightningProgram*>(program_)->metadata();
+  const CodeObjectMD* programMD = static_cast<const LightningProgram*>(program())->metadata();
  assert(programMD != nullptr);

  const KernelMD* kernelMD = FindKernelMetadata(programMD, name());
@@ -172,7 +197,7 @@ bool LightningKernel::init() {
  InitParameters(*kernelMD, KernargSegmentByteSize());

  // Set the workgroup information for the kernel
-  workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
+  workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
  assert(workGroupInfo_.availableLDSSize_ > 0);
  workGroupInfo_.availableSGPRs_ = 104;
  workGroupInfo_.availableVGPRs_ = 256;
@@ -196,7 +221,7 @@ bool LightningKernel::init() {
  }

  if (!kernelMD->mAttrs.mRuntimeHandle.empty()) {
-    hsa_agent_t             agent = program_->hsaDevice();
+    hsa_agent_t             agent = program()->hsaDevice();
    hsa_executable_symbol_t kernelSymbol;
    hsa_status_t            status;
    int                     variable_size;
@@ -208,7 +233,7 @@ bool LightningKernel::init() {
    // only after the hsa executable is loaded. The below code copies the kernel code object handle to the
    // address of the variable.

-    status = hsa_executable_get_symbol_by_name(program_->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(),
+    status = hsa_executable_get_symbol_by_name(program()->hsaExecutable(), kernelMD->mAttrs.mRuntimeHandle.c_str(),
                                               &agent, &kernelSymbol);
    if (status != HSA_STATUS_SUCCESS) {
      return false;
@@ -239,7 +264,7 @@ bool LightningKernel::init() {
  }

  uint32_t wavefront_size = 0;
-  if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
+  if (hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) !=
      HSA_STATUS_SUCCESS) {
    return false;
  }
@@ -258,7 +283,7 @@ bool LightningKernel::init() {

  workGroupInfo_.usedStackSize_ = 0;

-  workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+  workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;

  workGroupInfo_.wavefrontSize_ = wavefront_size;

@@ -278,18 +303,18 @@ bool LightningKernel::init() {
 bool HSAILKernel::init() {
  acl_error errorCode;
  // compile kernel down to ISA
-  hsa_agent_t hsaDevice = program_->hsaDevice();
+  hsa_agent_t hsaDevice = program()->hsaDevice();
  // Pull out metadata from the ELF
  size_t sizeOfArgList;
-  aclCompiler* compileHandle = program_->dev().compiler();
+  aclCompiler* compileHandle = program()->dev().compiler();
  std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
-  errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY,
+  errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
                                         openClKernelName.c_str(), nullptr, &sizeOfArgList);
  if (errorCode != ACL_SUCCESS) {
    return false;
  }
  std::unique_ptr<char[]> argList(new char[sizeOfArgList]);
-  errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY,
+  errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_ARGUMENT_ARRAY,
                                         openClKernelName.c_str(), argList.get(), &sizeOfArgList);
  if (errorCode != ACL_SUCCESS) {
    return false;
@@ -300,17 +325,17 @@ bool HSAILKernel::init() {

  // Set the workgroup information for the kernel
  memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
-  workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
+  workGroupInfo_.availableLDSSize_ = program()->dev().info().localMemSizePerCU_;
  assert(workGroupInfo_.availableLDSSize_ > 0);
  workGroupInfo_.availableSGPRs_ = 104;
  workGroupInfo_.availableVGPRs_ = 256;
  size_t sizeOfWorkGroupSize;
-  errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE,
+  errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE,
                                         openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
  if (errorCode != ACL_SUCCESS) {
    return false;
  }
-  errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE,
+  errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_WORK_GROUP_SIZE,
                                         openClKernelName.c_str(), workGroupInfo_.compileSize_,
                                         &sizeOfWorkGroupSize);
  if (errorCode != ACL_SUCCESS) {
@@ -319,7 +344,7 @@ bool HSAILKernel::init() {

  uint32_t wavefront_size = 0;
  if (HSA_STATUS_SUCCESS !=
-      hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) {
+      hsa_agent_get_info(program()->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) {
    return false;
  }
  assert(wavefront_size > 0);
@@ -344,18 +369,18 @@ bool HSAILKernel::init() {
  }

  workGroupInfo_.usedStackSize_ = 0;
-  workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+  workGroupInfo_.wavefrontPerSIMD_ = program()->dev().info().maxWorkItemSizes_[0] / wavefront_size;
  workGroupInfo_.wavefrontSize_ = wavefront_size;
  if (workGroupInfo_.compileSize_[0] != 0) {
    workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
        workGroupInfo_.compileSize_[2];
  } else {
-    workGroupInfo_.size_ = program_->dev().info().preferredWorkGroupSize_;
+    workGroupInfo_.size_ = program()->dev().info().preferredWorkGroupSize_;
  }

  // Pull out printf metadata from the ELF
  size_t sizeOfPrintfList;
-  errorCode = aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY,
+  errorCode = aclQueryInfo(compileHandle, program()->binaryElf(), RT_GPU_PRINTF_ARRAY,
                                         openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
  if (errorCode != ACL_SUCCESS) {
    return false;
@@ -367,7 +392,7 @@ bool HSAILKernel::init() {
    if (!aclPrintfList) {
      return false;
    }
-    errorCode = aclQueryInfo(compileHandle, program_->binaryElf(),
+    errorCode = aclQueryInfo(compileHandle, program()->binaryElf(),
                                           RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(),
                                           aclPrintfList.get(), &sizeOfPrintfList);
    if (errorCode != ACL_SUCCESS) {
@@ -22,6 +22,8 @@ class Kernel : public device::Kernel {
         const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
         const uint32_t kernargSegmentAlignment);

+  Kernel(std::string name, Program* prog);
+
  const uint64_t& KernelCodeHandle() { return kernelCodeHandle_; }

  const uint32_t WorkgroupGroupSegmentByteSize() const { return workgroupGroupSegmentByteSize_; }
@@ -37,15 +39,15 @@ class Kernel : public device::Kernel {
  //! Initializes the metadata required for this kernel
  virtual bool init() = 0;

-  const Program* program() const { return static_cast<const Program*>(program_); }
+  const Program* program() const { return static_cast<const Program*>(&prog_); }

 protected:
-  Program* program_;                      //!< The roc::Program context
+//  Program* program_;                      //!< The roc::Program context
  uint64_t kernelCodeHandle_;             //!< Kernel code handle (aka amd_kernel_code_t)
-  const uint32_t workgroupGroupSegmentByteSize_;
-  const uint32_t workitemPrivateSegmentByteSize_;
-  const uint32_t kernargSegmentByteSize_;
-  const uint32_t kernargSegmentAlignment_;
+  uint32_t workgroupGroupSegmentByteSize_;
+  uint32_t workitemPrivateSegmentByteSize_;
+  uint32_t kernargSegmentByteSize_;
+  uint32_t kernargSegmentAlignment_;
  size_t kernelDirectiveOffset_;
 };

@@ -74,6 +76,10 @@ class LightningKernel : public roc::Kernel {
   : roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
                 workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
  }
+
+  LightningKernel(std::string name, Program* prog)
+   : roc::Kernel(name, prog) {}
+
  //! Initializes the metadata required for this kernel
  virtual bool init() final;
 };
@@ -488,6 +488,19 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
    return false;
  }

+#if defined(USE_COMGR_LIBRARY)
+  for (const auto &kernelMeta : kernelMetadataMap_) {
+    const std::string kernelName = kernelMeta.first;
+    Kernel* aKernel = new roc::LightningKernel(kernelName, this);
+    if (!aKernel->init()) {
+      return false;
+    }
+    aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+    aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") !=
+                                   std::string::npos);
+    kernels()[kernelName] = aKernel;
+  }
+#else
  // Get the list of kernels
  std::vector<std::string> kernelNameList;
  status = hsa_executable_iterate_agent_symbols(hsaExecutable_, agent, GetKernelNamesCallback,
@@ -582,6 +595,7 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
                                   std::string::npos);
    kernels()[kernelName] = aKernel;
  }
+#endif // defined(USE_COMGR_LIBRARY)
 #endif  // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
  return true;
 }