//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palblit.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palsched.hpp"
#include "platform/commandqueue.hpp"
#include "utils/options.hpp"
#include "acl.h"

#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
#include "llvm/Support/AMDGPUMetadata.h"

typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
#endif  // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)

#include <string>
#include <memory>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>
#include <algorithm>

namespace pal {

void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
                                   const uint32_t groupSegmentSize, const uint16_t numSGPRs,
                                   const uint16_t numVGPRs) {
  workGroupInfo_.scratchRegs_ = amd::alignUp(privateSegmentSize, 16) / sizeof(uint32_t);
  // Make sure runtime matches HW alignment, which is 256 scratch regs (DWORDs) per wave
  constexpr uint32_t ScratchRegAlignment = 256;
  workGroupInfo_.scratchRegs_ =
      amd::alignUp((workGroupInfo_.scratchRegs_ * dev().info().wavefrontWidth_),
                   ScratchRegAlignment) / dev().info().wavefrontWidth_;
  workGroupInfo_.privateMemSize_ = workGroupInfo_.scratchRegs_ * sizeof(uint32_t);
  workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
  workGroupInfo_.usedSGPRs_ = numSGPRs;
  workGroupInfo_.usedStackSize_ = 0;
  workGroupInfo_.usedVGPRs_ = numVGPRs;

  if (!prog().isNull()) {
    workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu;
    workGroupInfo_.availableSGPRs_ =
        dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
    workGroupInfo_.availableVGPRs_ =
        dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
    workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ =
        dev().info().wavefrontWidth_;
  } else {
    workGroupInfo_.availableLDSSize_ = 64 * Ki;
    workGroupInfo_.availableSGPRs_ = 104;
    workGroupInfo_.availableVGPRs_ = 256;
    workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64;
  }
}

bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc) {
  if (!sym) {
    return false;
  }
  if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) {
    return false;
  }

  // Copy code object of this kernel from the program CPU segment
  memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)),
         sizeof(amd_kernel_code_t));

  return true;
}

bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
  amd_kernel_code_t* akc = &akc_;

  if (!setKernelCode(sym, akc)) {
    return false;
  }

  if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
                    reinterpret_cast<void*>(&codeSize_))) {
    return false;
  }

  // Setup the the workgroup info
  setWorkGroupInfo(akc->workitem_private_segment_byte_size, akc->workgroup_group_segment_byte_size,
                   akc->wavefront_sgpr_count, akc->workitem_vgpr_count);

  workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
  kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;

  return true;
}

HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions)
    : device::Kernel(prog->dev(), name, *prog),
      compileOptions_(compileOptions),
      index_(0),
      code_(0),
      codeSize_(0),
      workgroupGroupSegmentByteSize_(0),
      kernargSegmentByteSize_(0) {
  flags_.hsa_ = true;
}

HSAILKernel::~HSAILKernel() {}

bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
#if defined(WITH_COMPILER_LIB)
  acl_error error = ACL_SUCCESS;
  std::string openClKernelName = openclMangledName(name());
  flags_.internalKernel_ =
      (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
  // compile kernel down to ISA
  if (finalize) {
    std::string options(compileOptions_.c_str());
    options.append(" -just-kernel=");
    options.append(openClKernelName.c_str());
    // Append an option so that we can selectively enable a SCOption on CZ
    // whenever IOMMUv2 is enabled.
    if (dev().settings().svmFineGrainSystem_) {
      options.append(" -sc-xnack-iommu");
    }
    error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
                       ACL_TYPE_ISA, nullptr);
    buildLog_ += aclGetCompilerLog(dev().compiler());
    if (error != ACL_SUCCESS) {
      LogError("Failed to finalize kernel");
      return false;
    }
  }

  aqlCreateHWInfo(sym);

  // Pull out metadata from the ELF
  size_t sizeOfArgList;
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
                       openClKernelName.c_str(), nullptr, &sizeOfArgList);
  if (error != ACL_SUCCESS) {
    return false;
  }

  char* aclArgList = new char[sizeOfArgList];
  if (nullptr == aclArgList) {
    return false;
  }
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
                       openClKernelName.c_str(), aclArgList, &sizeOfArgList);
  if (error != ACL_SUCCESS) {
    return false;
  }
  // Set the argList
  InitParameters(reinterpret_cast<const aclArgData*>(aclArgList), argsBufferSize());
  delete[] aclArgList;

  size_t sizeOfWorkGroupSize;
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
                       openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
  if (error != ACL_SUCCESS) {
    return false;
  }
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
                       openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
  if (error != ACL_SUCCESS) {
    return false;
  }

  // Copy wavefront size
  workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;
  // Find total workgroup size
  if (workGroupInfo_.compileSize_[0] != 0) {
    workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
        workGroupInfo_.compileSize_[2];
  } else {
    workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
  }

  // Pull out printf metadata from the ELF
  size_t sizeOfPrintfList;
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
                       openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
  if (error != ACL_SUCCESS) {
    return false;
  }

  // Make sure kernel has any printf info
  if (0 != sizeOfPrintfList) {
    char* aclPrintfList = new char[sizeOfPrintfList];
    if (nullptr == aclPrintfList) {
      return false;
    }
    error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
                         openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList);
    if (error != ACL_SUCCESS) {
      return false;
    }

    // Set the PrintfList
    InitPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
    delete[] aclPrintfList;
  }

  aclMetadata md;
  md.enqueue_kernel = false;
  size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
                       openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
  if (error != ACL_SUCCESS) {
    return false;
  }
  flags_.dynamicParallelism_ = md.enqueue_kernel;

  md.kernel_index = -1;
  size_t sizeOfIndex = sizeof(md.kernel_index);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX,
                       openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
  if (error != ACL_SUCCESS) {
    return false;
  }
  index_ = md.kernel_index;

  size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
                       openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
                       &sizeOfWavesPerSimdHint);
  if (error != ACL_SUCCESS) {
    return false;
  }

  waveLimiter_.enable();

  size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
                       openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
                       &sizeOfWorkGroupSizeHint);
  if (error != ACL_SUCCESS) {
    return false;
  }

  size_t sizeOfVecTypeHint;
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
                       openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
  if (error != ACL_SUCCESS) {
    return false;
  }

  if (0 != sizeOfVecTypeHint) {
    char* VecTypeHint = new char[sizeOfVecTypeHint + 1];
    if (NULL == VecTypeHint) {
      return false;
    }
    error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
                         openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint);
    if (error != ACL_SUCCESS) {
      return false;
    }
    VecTypeHint[sizeOfVecTypeHint] = '\0';
    workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint);
    delete[] VecTypeHint;
  }

#endif  // defined(WITH_COMPILER_LIB)
  return true;
}

const Device& HSAILKernel::dev() const { return reinterpret_cast<const Device&>(dev_); }

const HSAILProgram& HSAILKernel::prog() const {
  return reinterpret_cast<const HSAILProgram&>(prog_);
}

hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
                                                         const amd::NDRangeContainer& sizes,
                                                         const_address params,
                                                         size_t ldsAddress, uint64_t vmDefQueue,
                                                         uint64_t* vmParentWrap) const {
  const_address parameters = params;
  uint64_t argList;
  address aqlArgBuf = gpu.managedBuffer().reserve(
      argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
  gpu.addVmMemory(gpu.managedBuffer().activeMemory());

  if (dynamicParallelism()) {
    // Provide the host parent AQL wrap object to the kernel
    AmdAqlWrap wrap = {};
    wrap.state = AQL_WRAP_BUSY;
    *vmParentWrap = gpu.cb(1)->UploadDataToHw(&wrap, sizeof(AmdAqlWrap));
    gpu.addVmMemory(gpu.cb(1)->ActiveMemory());
  }

  // The check below handles a special case of single context with multiple devices
  // when the devices use different compilers(HSAIL and LC) and have different signatures
  const amd::KernelSignature& signature =
    (this->signature().version() == kernel.signature().version()) ?
    kernel.signature() : this->signature();

  // If signatures don't match, then patch the parameters
  if (signature.version() != kernel.signature().version()) {
    WriteAqlArgAt(aqlArgBuf, parameters, signature.paramsSize() - signature.at(0).offset_,
                  signature.at(0).offset_);
    parameters = aqlArgBuf;
  }

  // Check if runtime has to setup hidden arguments
  for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
    const auto it = signature.at(i);
    size_t offset;
    switch (it.info_.oclObject_) {
      case amd::KernelParameterDescriptor::HiddenNone:
        // void* zero = 0;
        // WriteAqlArgAt(const_cast<address>(parameters), &zero, it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
        offset = sizes.offset()[0];
        WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
        if (sizes.dimensions() >= 2) {
          offset = sizes.offset()[1];
          WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
        if (sizes.dimensions() >= 3) {
          offset = sizes.offset()[2];
          WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
        if ((printfInfo().size() > 0) &&
            // and printf buffer was allocated
            (gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
          // and set the fourth argument as the printf_buffer pointer
          size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
          gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
          WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenDefaultQueue:
        if (vmDefQueue != 0) {
          WriteAqlArgAt(const_cast<address>(parameters), &vmDefQueue, it.size_, it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenCompletionAction:
        if (*vmParentWrap != 0) {
          WriteAqlArgAt(const_cast<address>(parameters), vmParentWrap, it.size_, it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenMultiGridSync:
        break;
    }
  }

  // Load all kernel arguments
  if (signature.version() == kernel.signature().version()) {
    WriteAqlArgAt(aqlArgBuf, parameters, argsBufferSize(), 0);
  }

  // Note: In a case of structs the size won't match,
  // since HSAIL compiler expects a reference...
  assert(argsBufferSize() <= signature.paramsSize() &&
         "A mismatch of sizes of arguments between compiler and runtime!");

  // hsa_kernel_dispatch_packet_t disp;
  hsa_kernel_dispatch_packet_t* hsaDisp =
      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());

  amd::NDRange local(sizes.local());
  const amd::NDRange& global = sizes.global();

  // Check if runtime has to find local workgroup size
  FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);

  constexpr uint16_t kDispatchPacketHeader =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (1 << HSA_PACKET_HEADER_BARRIER) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

  hsaDisp->header = kDispatchPacketHeader;
  hsaDisp->setup = sizes.dimensions();

  hsaDisp->workgroup_size_x = local[0];
  hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
  hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;

  hsaDisp->grid_size_x = global[0];
  hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
  hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
  hsaDisp->reserved2 = 0;

  // Initialize kernel ISA and execution buffer requirements
  hsaDisp->private_segment_size = spillSegSize();
  hsaDisp->group_segment_size = ldsAddress;
  hsaDisp->kernel_object = gpuAqlCode();

  hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
  hsaDisp->reserved2 = 0;
  hsaDisp->completion_signal.handle = 0;
  memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));

  if (AMD_HSA_BITS_GET(akc_.kernel_code_properties,
                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
    gpu.addVmMemory(gpu.hsaQueueMem());
  }

  return hsaDisp;
}

const LightningProgram& LightningKernel::prog() const {
  return reinterpret_cast<const LightningProgram&>(prog_);
}

#if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) {
  for (const KernelMD& kernelMD : programMD->mKernels) {
    if (kernelMD.mName == name) {
      return &kernelMD;
    }
  }
  return nullptr;
}
#endif  // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)

#if defined(USE_COMGR_LIBRARY)
bool LightningKernel::init() {
  flags_.internalKernel_ =
      (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;

  const amd_comgr_metadata_node_t* kernelMetaNode = prog().getKernelMetadata(name());
  if (kernelMetaNode == nullptr) {
    return false;
  }

  KernelMD kernelMD;
  if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
    return false;
  }

  symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;

  workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
  kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;

  // Copy codeobject of this kernel from the program CPU segment
  hsa_agent_t agent;
  agent.handle = 1;

  auto sym = prog().GetSymbol(symbolName().c_str(), const_cast<hsa_agent_t*>(&agent));

  if (!setKernelCode(sym, &akc_)) {
    return false;
  }

  if (!prog().isNull()) {
    codeSize_ = prog().codeSegGpu().owner()->getSize();

    // handle device enqueue
    if (!kernelMD.mAttrs.mRuntimeHandle.empty()) {
      hsa_agent_t agent;
      agent.handle = 1;
      amd::hsa::loader::Symbol* rth_symbol;

      // Get the runtime handle symbol GPU address
      rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
                                    const_cast<hsa_agent_t*>(&agent));
      uint64_t symbol_address;
      rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);

      // Copy the kernel_object pointer to the runtime handle symbol GPU address
      const Memory& codeSegGpu = prog().codeSegGpu();
      uint64_t offset = symbol_address - codeSegGpu.vmAddress();
      uint64_t kernel_object = gpuAqlCode();
      VirtualGPU* gpu = codeSegGpu.dev().xferQueue();

      const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};

      codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
    }
  }

  // Setup the the workgroup info
  setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
                   kernelMD.mCodeProps.mGroupSegmentFixedSize, kernelMD.mCodeProps.mNumSGPRs,
                   kernelMD.mCodeProps.mNumVGPRs);

  // Copy wavefront size
  workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;

  workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize;
  if (workGroupInfo_.size_ == 0) {
    return false;
  }

  // handle the printf metadata if any
  std::vector<std::string> printfStr;
  if (!GetPrintfStr(&printfStr)) {
    return false;
  }

  if (!printfStr.empty()) {
    InitPrintf(printfStr);
  }

  return true;
}
#endif  // defined(USE_COMGR_LIBRARY)

bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
  flags_.internalKernel_ =
      (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;

  aqlCreateHWInfo(symbol);

  const CodeObjectMD* programMD = prog().metadata();
  assert(programMD != nullptr);

  const KernelMD* kernelMD = FindKernelMetadata(programMD, name());

  if (kernelMD == nullptr) {
    return false;
  }

  // Set the argList
  InitParameters(*kernelMD, argsBufferSize());

  if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) {
    const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize;
    workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0];
    workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1];
    workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2];
  }

  if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) {
    const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint;
    workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0];
    workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1];
    workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2];
  }

  if (!kernelMD->mAttrs.mVecTypeHint.empty()) {
    workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str();
  }

  if (!kernelMD->mAttrs.mRuntimeHandle.empty()) {
    hsa_agent_t agent;
    agent.handle = 1;
    amd::hsa::loader::Symbol* rth_symbol;

    // Get the runtime handle symbol GPU address
    rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
                                  const_cast<hsa_agent_t*>(&agent));
    uint64_t symbol_address;
    rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);

    // Copy the kernel_object pointer to the runtime handle symbol GPU address
    const Memory& codeSegGpu = prog().codeSegGpu();
    uint64_t offset = symbol_address - codeSegGpu.vmAddress();
    VirtualGPU* gpu = codeSegGpu.dev().xferQueue();

    const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};

    codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
  }

  // Copy wavefront size
  workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_;

  workGroupInfo_.size_ = kernelMD->mCodeProps.mMaxFlatWorkGroupSize;
  if (workGroupInfo_.size_ == 0) {
    return false;
  }

  InitPrintf(programMD->mPrintf);

  /*FIXME_lmoriche:
  size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
  error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
      RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
      &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
  if (error != ACL_SUCCESS) {
      return false;
  }

  waveLimiter_.enable();
  */
#endif  // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
  return true;
}

}  // namespace pal