rocm-systems/rocclr/runtime/device/gpu/gpuprogram.cpp

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//

#include "os/os.hpp"
#include "utils/flags.hpp"
#include "include/aclTypes.h"
#include "utils/bif_section_labels.hpp"
#include "device/gpu/gpuprogram.hpp"
#include "device/gpu/gpublit.hpp"
#include "macrodata.h"
#include "MDParser/AMDILMDInterface.h"
#include <fstream>
#include <sstream>
#include <cstdio>
#include "utils/options.hpp"
#include "newcore.h"

// Declaration of the externally defined (C linkage) HSA loader entry point.
// Besides the compiler handle and the binary, it receives an opaque userData
// pointer and four callbacks. The callback signatures match the helpers
// defined in this file: allocateGPUMemory, DmaMemoryCopy,
// getSamplerObjectParam and initializeSamplerObject.
// NOTE(review): the loader's own behaviour and the meaning of its bool result
// are not visible from this file - consult the compiler library.
extern "C" bool
ACL_API_ENTRY aclHsaLoader(
    aclCompiler* compiler_handle,
    aclBinary* bin,
    void* userData,
    void (*allocateGPUMemory)(void* userData, size_t size, uint64_t* GPUMemory),
    bool (*DmaMemoryCopy)(void* userData, uint64_t offset, const void* pSrc, size_t size),
    void (*getSamplerObjectParam)(uint32_t* size, uint32_t* alignment),
    void (*initializeSamplerObject)(void* userData, uint64_t offset, bool unnormalize,
    uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW));

bool
DmaMemoryCopy(void* userData, uint64_t offset, const void* pSrc, size_t size)
{
    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
    gpu::Memory* mem = const_cast<gpu::Memory*>(prog->globalStore());
    if (mem == NULL) {
        return false;
    }
    size_t maxCopySize = prog->globalVariableTotalSize();
    if (maxCopySize >= size) {
        maxCopySize = size;
    }
    amd::Coord3D origin(offset);
    amd::Coord3D region(maxCopySize);
    // memcpy mode
    if (pSrc) {
        const bool Entire  = true;
        return prog->dev().xferMgr().writeBuffer(pSrc, *mem, origin, region, Entire);
    }
    // memset mode
    else {
        char pattern = 0;
        return prog->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern),
        origin, region);
    }
}

void
AllocateGPUMemory(void* userData, size_t size, uint64_t* GPUMemory)
{
    gpu::Memory* mem = NULL;
    void*   cpuPtr = NULL;
    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);

    mem = new gpu::Memory(prog->dev(), amd::alignUp(size, gpu::ConstBuffer::VectorSize));

    // Initialize constant buffer
    if ((mem == NULL) || !mem->create(gpu::Resource::Local)) {
        delete mem;
        *GPUMemory = 0;
        return;
    }
    *GPUMemory = mem->vmAddress();
    prog->setGlobalStore(mem);
    prog->setGlobalVariableTotalSize(size);
}

// Reports the size and alignment of one sampler object.
// With GPU_DIRECT_SRD set the HSA sampler object constants are used,
// otherwise both values are sizeof(uint64_t).
void
GetSamplerObjectParams(uint32_t* size, uint32_t* alignment)
{
    if (!GPU_DIRECT_SRD) {
        *size = sizeof(uint64_t);
        *alignment = sizeof(uint64_t);
        return;
    }

    *size = HSA_SAMPLER_OBJECT_SIZE;
    *alignment = HSA_SAMPLER_OBJECT_ALIGNMENT;
}

void
InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize,
    uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)
{
    assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode");
    HsaSamplerFilterType filter = static_cast<HsaSamplerFilterType>(fltr);
    HsaSamplerAddressMode boundaryU = static_cast<HsaSamplerAddressMode>(addrU);

    uint32_t    state = (unnormalize) ?
        amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue;
    if (filter == HSA_SAMP_FILTER_NEAREST) {
        state |= amd::Sampler::StateFilterNearest;
    }
    else if (filter == HSA_SAMP_FILTER_LINEAR) {
        state |= amd::Sampler::StateFilterLinear;
    }
    switch (boundaryU) {
    case HSA_SAMP_ADDRESS_CLAMPEDGE:
        state |= amd::Sampler::StateAddressClampToEdge;
        break;
    case HSA_SAMP_ADDRESS_CLAMPBORDER:
        state |= amd::Sampler::StateAddressClamp;
        break;
    case HSA_SAMP_ADDRESS_WRAP:
        state |= amd::Sampler::StateAddressRepeat;
        break;
    case HSA_SAMP_ADDRESS_MIRROR:
        state |= amd::Sampler::StateAddressMirroredRepeat;
        break;
    case HSA_SAMP_ADDRESS_MIRRORONCE:
    case HSA_SAMP_ADDRESS_NONE:
    default:
        break;
    }

    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
    if (prog->dev().settings().hsailDirectSRD_) {
        char *pCPUbuf = new char[HSA_SAMPLER_OBJECT_SIZE];
        if (!pCPUbuf) {
          assert(false);
          return;
        }
        prog->dev().fillHwSampler(state, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
        DmaMemoryCopy(userData, offset, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
        delete pCPUbuf;
    }
    else {
        gpu::Sampler* sampler = new gpu::Sampler(prog->dev());
        if ((sampler != NULL) && sampler->create(state)) {
            uint64_t    hwSrd = sampler->hwSrd();
            DmaMemoryCopy(userData, offset, &hwSrd, sizeof(uint64_t));
            prog->addSampler(sampler);
        }
    }
    return;
}

namespace gpu {

// Prepares a build: runs the base-class initialization, publishes the
// per-build info to the options and sets up the output ELF binary.
// Returns false if the base initialization or the ELF output setup fails.
bool
NullProgram::initBuild(amd::option::Options* options)
{
    if (!device::Program::initBuild(options)) {
        return false;
    }

    // Use the generic "gpu" name when the HW info carries no machine target
    const char* machine = dev().hwInfo()->machineTarget_;
    const bool hasMachine = (machine != NULL) && (machine[0] != '\0');
    options->setPerBuildInfo(
        hasMachine ? machine : "gpu",
        clBinary()->getEncryptCode(),
        true // FIXME: the dev ptr is used to query the wavefront size.
        );

    // Recompilation from IL may happen (Kernel::recompil()) to generate
    // correct ISA for 7xx, hence AMDIL is forced into the binary for
    // targets up to CAL_TARGET_730.
    clBinary()->init(options, (dev().calTarget() <= CAL_TARGET_730));

    // Optional dump file for the output ELF
    std::string dumpFile;
    if (options->isDumpFlagSet(amd::option::DUMP_BIF)) {
        dumpFile = options->getDumpFileName(".bin");
    }

    const bool is64Bit = dev().settings().use64BitPtr_;
    const bool elfReady = clBinary()->setElfOut(
        is64Bit ? ELFCLASS64 : ELFCLASS32,
        dumpFile.empty() ? NULL : dumpFile.c_str());
    if (!elfReady) {
        LogError("Setup elf out for gpu failed");
        return false;
    }
    return true;
}

// Finishes a build: drops the ELF output and input objects, clears the
// stored binary when the build failed, then defers to the base class.
bool
NullProgram::finiBuild(bool isBuildGood)
{
    clBinary()->resetElfOut();
    clBinary()->resetElfIn();

    const bool buildFailed = !isBuildGood;
    if (buildFailed) {
        // Do not let the encrypted binary form leak out of a failed build
        clBinary()->setBinary(NULL, 0);
    }

    const bool result = device::Program::finiBuild(isBuildGood);
    return result;
}

// Queries the compiler library for the target info and caches it in info_.
// The architecture name is GPU_TARGET_INFO_ARCH, with "64" appended for
// 64-bit pointers. An empty 'str' selects the device's own target name; any
// other value is forwarded as is. A failed query is only logged.
// NOTE(review): a NULL 'str' is forwarded unchanged to aclGetTargetInfo -
// confirm the library accepts it.
const aclTargetInfo &
NullProgram::info(const char * str) {
    std::string archName(GPU_TARGET_INFO_ARCH);
    if (dev().settings().use64BitPtr_) {
        archName.append("64");
    }

    const char* target = str;
    if ((str != NULL) && (str[0] == '\0')) {
        target = dev().hwInfo()->targetName_;
    }

    acl_error status;
    info_ = aclGetTargetInfo(archName.c_str(), target, &status);
    if (ACL_SUCCESS != status) {
        LogWarning("aclGetTargetInfo failed");
    }
    return info_;
}

// Destructor: frees the ILFunc objects still held by the program, then
// releases the CL binary object.
NullProgram::~NullProgram()
{
    // Destroy all ILFunc objects
    freeAllILFuncs();
    // Release the CL binary after the functions are gone
    releaseClBinary();
}

bool
NullProgram::isCalled(const ILFunc* base, const ILFunc* func)
{
    // Loop through all functions, which will be called from the base one
    for (size_t i = 0; i < base->calls_.size(); ++i) {
        assert(base->calls_[i] != base && "recursion");
        // Check if the current function is the one
        if (base->calls_[i] == func) {
            return true;
        }
        // We have to use a recursive method to make sure it's not called inside
        else if (isCalled(base->calls_[i], func)) {
            return true;
        }
    }
    return false;
}

// Overwrites the text at offset patch_ of 'kernel' with "call <index>;".
// The index is written in decimal; an index of 0 produces no digits at all,
// i.e. "call ;". As many characters as the patch is long are replaced.
void
NullProgram::patchMain(std::string& kernel, uint index)
{
    // Decimal digits of the index, most significant first
    std::string digits;
    for (uint rest = index; rest != 0; rest /= 10) {
        digits.insert(digits.begin(), static_cast<char>((rest % 10) + 0x30));
    }

    const std::string callPatch = "call " + digits + ";";

    // Patch the program
    kernel.replace(patch_, callPatch.size(), callPatch);
}

// Creates a GPU kernel object and registers it in the program.
//   name       - kernel name
//   initData   - initialization data forwarded to the Kernel constructor
//   code       - kernel code forwarded to Kernel::create()
//   metadata   - kernel metadata forwarded to Kernel::create()
//   created    - set to true only when the kernel was created and registered
//   binaryCode - optional binary forwarded to Kernel::create()
//   binarySize - size of binaryCode
// Returns the new kernel, or NULL on failure. The kernel's build log is
// appended to the program's build log in both cases; with build timing
// enabled the elapsed time is appended as well.
NullKernel*
Program::createKernel(
    const std::string&  name, const Kernel::InitData* initData,
    const std::string&  code, const std::string&  metadata, bool* created,
    const void* binaryCode, size_t binarySize)
{
    amd::option::Options* buildOptions = getCompilerOptions();
    const bool timeBuild = buildOptions->oVariables->EnableBuildTiming;
    const uint64_t startTime = timeBuild ? amd::Os::timeNanos() : 0;

    *created = false;

    Kernel* newKernel = new Kernel(name,
        static_cast<const gpu::Device&>(device()), *this, initData);
    if (newKernel == NULL) {
        buildLog_ += "new Kernel() failed";
        LogPrintfError("new Kernel() failed for kernel %s!", name.c_str());
        return NULL;
    }

    if (!newKernel->create(code, metadata, binaryCode, binarySize)) {
        // Keep the error and the log of the failed kernel, then drop it
        buildError_ = newKernel->buildError();
        buildLog_ += newKernel->buildLog();
        delete newKernel;
        LogPrintfError("Kernel creation failed for kernel %s!", name.c_str());
        return NULL;
    }

    // Register the kernel in the program
    kernels()[newKernel->name()] = newKernel;
    buildLog_ += newKernel->buildLog();

    if (timeBuild) {
        std::stringstream timing;
        timing << "    Time for creating kernel (" << name << ") : "
               << (amd::Os::timeNanos() - startTime)/1000ULL << " us\n";
        buildLog_ += timing.str();
    }

    *created = true;
    return static_cast<NullKernel*>(newKernel);
}

// Links the program into an executable and creates its kernels.
//
// Steps, in order:
//  1. With neither an LLVM binary nor IL text available, the input OpenCL
//     binary is opened. If the ISA can be loaded from it, the function
//     returns true right away (after creating a new ELF binary if kernels
//     were recompiled, or restoring the original binary otherwise).
//     Otherwise the LLVM IR is loaded from the binary for recompilation;
//     if that is not possible the link fails.
//  2. An available LLVM binary is compiled to IL text.
//  3. With EnableDebug set, the debug IL and the DWARF sections are added
//     to the output ELF (and optionally dumped to a ".debugil" file).
//  4. The IL is parsed; for every kernel function a private copy of the IL
//     is reduced to the functions it calls, patched, prefixed with the
//     required macros, compiled and stored in the binary. Metadata of
//     non-kernel functions is saved as symbols when AMDIL is being saved.
//  5. The final OpenCL binary is created.
//
// Returns true on success; false on any failure, including the case where
// no IL text is available or it cannot be parsed. Failures append a message
// to buildLog_.
bool
NullProgram::linkImpl(amd::option::Options* options)
{
    if (llvmBinary_.empty()) {
        // We are using either CL binary or IL directly.
        bool hasRecompiled;
        if (ilProgram_.empty()) {
            // Setup elfIn() and try to load ISA from binary
            // This elfIn() will be released at the end of build by finiBuild().
            if (!clBinary()->setElfIn(ELFCLASS32)) {
                buildLog_ += "Internal error: Setting input OCL binary failed!\n";
                LogError("Setting input OCL binary failed");
                return false;
            }
            bool loadSuccess = false;
            if (!options->oVariables->ForceLLVM) {
                loadSuccess = loadBinary(&hasRecompiled);
            }
            // Under -use-debugil (without ForceLLVM) a failed load is fatal
            if (!loadSuccess &&
                (options->oVariables->UseDebugIL &&
                 !options->oVariables->ForceLLVM)) {
                buildLog_ += "Internal error: Loading OpenCL binary under -use-debugil failed!\n";
                LogError("Loading OCL binary failed under -use-debugil");
                return false;
            }
            if (loadSuccess) {
                if (hasRecompiled) {
                    // Kernels were recompiled: carry the source and LLVM IR
                    // sections over and build a new ELF binary
                    char *section;
                    size_t sz;
                    if (clBinary()->saveSOURCE() &&
                        clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, &section, &sz)) {
                        clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz);
                    }
                    if (clBinary()->saveLLVMIR()) {
                        if (clBinary()->loadLlvmBinary(llvmBinary_, llvmBinaryIsSpir_) && (!llvmBinary_.empty())) {
                            clBinary()->elfOut()->addSection(llvmBinaryIsSpir_?amd::OclElf::SPIR:amd::OclElf::LLVMIR,
                                                  llvmBinary_.data(), llvmBinary_.size(), false);
                        }
                    }

                    setType(TYPE_EXECUTABLE);
                    if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) {
                        buildLog_ += "Internal error: Failed to create OpenCL binary!\n";
                        LogError("Failed to create OpenCL binary");
                        return false;
                    }
                }
                else {
                    // The original binary is good and reuse it.
                    // Release the new binary if there is.
                    clBinary()->restoreOrigBinary();
                }
                return true;
            }
            else if (clBinary()->loadLlvmBinary(llvmBinary_, llvmBinaryIsSpir_) &&
                     clBinary()->isRecompilable(llvmBinary_, amd::OclElf::CAL_PLATFORM)) {
                char *section;
                size_t sz;

                // Clean up and remove all the content generated before
                if (!clBinary()->clearElfOut()) {
                    buildLog_ += "Internal error: Resetting OpenCL Binary failed!\n";
                    LogError("Resetting output OCL binary failed");
                    return false;
                }

                if (clBinary()->saveSOURCE() &&
                    clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, &section, &sz)) {
                    clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz);
                }
                if (clBinary()->saveLLVMIR()) {
                    clBinary()->elfOut()->addSection(llvmBinaryIsSpir_?amd::OclElf::SPIR:amd::OclElf::LLVMIR,
                                         llvmBinary_.data(), llvmBinary_.size(), false);
                }
            }
            else {
                buildLog_ += "Internal error: Input OpenCL binary is not for the target!\n";
                LogError("OCL Binary isn't good for the target");
                return false;
            }
        }
    }

    if (!llvmBinary_.empty()) {
        // Compile llvm binary to the IL source code
        // This is link/OPT/Codegen part of compiler.
        cl_int iErr = compileBinaryToIL(options);
        if (iErr != CL_SUCCESS) {
            buildLog_ += "Error: Compilation from LLVMIR binary to IL text failed!";
            LogError(buildLog_.c_str());
            return false;
        }
    }

    if (!ilProgram_.empty() && options->oVariables->EnableDebug) {
        // Lets parse out the dwarf debug information and store it in the elf
        llvm::CompUnit compilation(ilProgram_);
        std::string debugILStr = compilation.getILStr();
        const char* dbgSec = debugILStr.c_str();
        size_t dbgSize = debugILStr.size();
        // Add an IL section that contains debug information and is the
        // output of LLVM codegen.
        clBinary()->elfOut()->addSection(amd::OclElf::ILDEBUG, dbgSec, dbgSize);

        if ((dbgSize > 0) && options->isDumpFlagSet(amd::option::DUMP_DEBUGIL)) {
            // Dump the debug IL with a hexadecimal line number in front of
            // every line.
            // NOTE(review): the scan starts at offset 1, so the first
            // character of the debug IL is not dumped - confirm intended.
            std::string debugilWithLine;
            size_t b = 1;
            size_t e;
            int linenum=0;
            // "%5x:  " produces 8 characters up to 0xfffff and more beyond
            // that, so the buffer is sized for any line number instead of
            // exactly 9 bytes.
            char cstr[32];
            while (b != std::string::npos) {
                e = debugILStr.find_first_of("\n", b);
                if (e != std::string::npos) {
                    // Keep the newline with the current line
                    ++e;
                }
                sprintf(&cstr[0], "%5x:  ", linenum);
                debugilWithLine.append(cstr);
                debugilWithLine.append(debugILStr.substr(b,e-b));
                b = e;
                ++linenum;
            }
            std::string debugilFileName = options->getDumpFileName(".debugil");
            std::fstream f;
            f.open(debugilFileName.c_str(), (std::fstream::out | std::fstream::binary));
            f.write(debugilWithLine.c_str(), debugilWithLine.size());
            f.close();
        }

        // Add one ELF section per non-empty DWARF bitstream
        for (unsigned x = 0; x < llvm::AMDILDwarf::DEBUG_LAST; ++x) {
            dbgSec = compilation.getDebugData()->getDwarfBitstream(
                    static_cast<llvm::AMDILDwarf::DwarfSection>(x), dbgSize);
            // Do not create an elf section if the size of the section is
            // 0.
            if (!dbgSize) {
                continue;
            }
            clBinary()->elfOut()->addSection(
                    static_cast<amd::OclElf::oclElfSections>(x
                        + amd::OclElf::DEBUG_INFO), dbgSec, dbgSize);
        }

    }

    // Create kernel objects
    if (!ilProgram_.empty() && parseKernels(ilProgram_)) {
        // Loop through all possible kernels
        for (size_t i = 0; i < funcs_.size(); ++i) {
            ILFunc* baseFunc = funcs_[i];
            // Make sure we have a Kernel function, but not Intrinsic or Simple
            if (baseFunc->state_ == ILFunc::Kernel) {
                size_t      metadataSize =
                    baseFunc->metadata_.end_ - baseFunc->metadata_.begin_;
                // Every kernel works on its own copy of the whole IL program
                std::string             kernel = ilProgram_;
                std::string             metadataStr;
                std::vector<ILFunc*>    notCalled;
                std::vector<ILFunc*>    called;
                std::map<int, const char**>   macros;
                size_t  j;
                Kernel::InitData    initData = {0};

                // Fill the list of not used functions, relatively to the current
                for (j = 0; j < funcs_.size(); ++j) {
                    if ((i != j) &&
                        ((funcs_[j]->state_ == ILFunc::Regular) ||
                         (funcs_[j]->state_ == ILFunc::Kernel))) {
                        if (!isCalled(baseFunc, funcs_[j])) {
                            notCalled.push_back(funcs_[j]);
                        }
                        else {
                            called.push_back(funcs_[j]);
                        }
                    }
                }

                // Get the metadata string for the current kernel
                metadataStr.insert(0, kernel,
                    baseFunc->metadata_.begin_, metadataSize);

                // Collect the code/metadata ranges of the unused functions in
                // descending begin_ order, so that erasing them one by one
                // does not shift the offsets of the ranges still to erase.
                std::vector<ILFunc::SourceRange*> rangeList;
                // Remove unused kernels, starting from the end
                for (j = notCalled.size(); j > 0; --j) {
                    ILFunc* func = notCalled[j-1];
                    std::vector<ILFunc::SourceRange*>::iterator it;
                    for (it = rangeList.begin(); it != rangeList.end(); ++it) {
                        if ((*it)->begin_ < func->metadata_.begin_) {
                            assert((*it)->begin_ < func->code_.begin_
                                && "code and metadata not next to each other");
                            break;
                        }
                        assert((*it)->begin_ >= func->code_.begin_
                            && "code and metadata not next to each other");
                    }
                    assert(func->metadata_.begin_ > func->code_.begin_
                        && "code after metadata");
                    if (it == rangeList.end()) {
                        rangeList.push_back(&func->metadata_);
                        rangeList.push_back(&func->code_);
                    }
                    else {
                        // Ends up as: metadata, code, then the lower ranges
                        it = rangeList.insert(it, &func->code_);
                        rangeList.insert(it, &func->metadata_);
                    }
                }
                for (j = 0; j < rangeList.size(); ++j) {
                    const ILFunc::SourceRange* range = rangeList[j];
                    kernel.erase(range->begin_, range->end_ - range->begin_);
                }

                // Patch the main program with a call to the current kernel
                patchMain(kernel, baseFunc->index_);

                // Add macros at the top, loop through all available functions
                // for this kernel (the called ones first, the kernel itself last)
                for (j = 0; j <= called.size(); ++j) {
                    ILFunc* func = (j < called.size()) ? called[j] : baseFunc;
                    for (size_t l = func->macros_.size(); l > 0 ; --l) {
                        int lines;
                        int idx = static_cast<int>(func->macros_[l - 1]);
                        const char** macro = amd::MacroDBGetMacro(&lines, idx);

                        // Make sure we didn't place this macro already
                        if (macros[idx] == NULL) {
                            macros[idx] = macro;
                            // Do we have a valid macro?
                            if ((lines == 0) || (macro == NULL)) {
                                buildLog_ += "Error: undefined macro!\n";
                                LogPrintfError(
                                    "Metadata reports undefined macro %d!", idx);
                                return false;
                            }
                            else {
                                // Add the macro to the IL source
                                for (int k = 0; k < lines; ++k) {
                                    kernel.insert(0, macro[k], strlen(macro[k]));
                                }
                            }
                        }
                    }
                    // Accumulate all emulated local and private sizes,
                    // necessary for the kernel execution
                    initData.localSize_   += func->localSize_;
                    initData.privateSize_ += func->privateSize_;

                    // Accumulate all HW local and private sizes,
                    // necessary for the kernel execution
                    initData.hwLocalSize_   += func->hwLocalSize_;
                    initData.hwPrivateSize_ += func->hwPrivateSize_;
                    initData.flags_         |= func->flags_;
                }

                // Create a GPU kernel
                bool    created;
                NullKernel* gpuKernel = createKernel(baseFunc->name_,
                    &initData, kernel.data(), metadataStr, &created);
                if (!created) {
                    buildLog_ += "Error: Creating kernel " +
                        baseFunc->name_ + " failed!\n";
                    LogError(buildLog_.c_str());
                    return false;
                }

                // Add the current kernel to the binary
                if (!clBinary()->storeKernel(baseFunc->name_, gpuKernel,
                    &initData, metadataStr, kernel)) {
                    buildLog_ += "Internal error: adding a kernel into OpenCL binary failed!\n";
                    return false;
                }
            }
            else {
                // Non-kernel function, save metadata symbols for recompilation
                if (clBinary()->saveAMDIL()) {
                    size_t      metadataSize =
                        baseFunc->metadata_.end_ - baseFunc->metadata_.begin_;
                    // Nothing to save for a function without metadata
                    if (metadataSize <= 0) {
                        continue;
                    }
                    std::string metadataStr;
                    // Get the metadata string
                    metadataStr.insert(0, ilProgram_, baseFunc->metadata_.begin_,
                        metadataSize);

                    std::stringstream  aStream;
                    aStream << "__OpenCL_" << baseFunc->name_ << "_fmetadata";
                    std::string metaName = aStream.str();
                    // Save metadata symbols in .rodata
                    if (!clBinary()->elfOut()->addSymbol(amd::OclElf::RODATA,
                                                        metaName.c_str(),
                                                        metadataStr.data(),
                                                        metadataStr.size())) {
                        buildLog_ += "Internal error: addSymbol failed!\n";
                        LogError ("AddSymbol failed");
                        return false;
                    }
                }
            }
        }

        setType(TYPE_EXECUTABLE);
        if (!createBinary(options)) {
            buildLog_ += "Internal error: creating OpenCL binary failed\n";
            return false;
        }

        // Destroy all ILFunc objects
        freeAllILFuncs();
        ilProgram_.clear();
        return true;
    }
    return false;
}

bool
NullProgram::linkImpl(const std::vector<device::Program*>& inputPrograms,
                      amd::option::Options* options,
                      bool createLibrary)
{
    std::vector<std::string*> llvmBinaries(inputPrograms.size());
    std::vector<bool> llvmBinaryIsSpir(inputPrograms.size());
    std::vector<device::Program*>::const_iterator it
        = inputPrograms.begin();
    std::vector<device::Program*>::const_iterator itEnd
        = inputPrograms.end();
    for (size_t i = 0; it != itEnd; ++it, ++i) {
        NullProgram* program = (NullProgram*)*it;

        if (program->llvmBinary_.empty()) {
            if (program->clBinary() == NULL) {
                buildLog_ += "Internal error: Input program not compiled!\n";
                LogError("Loading compiled input object failed");
                return false;
            }

            // We are using CL binary directly.
            // Setup elfIn() and try to load llvmIR from binary
            // This elfIn() will be released at the end of build by finiBuild().
            if (!program->clBinary()->setElfIn(ELFCLASS32)) {
                buildLog_ += "Internal error: Setting input OCL binary failed!\n";
                LogError("Setting input OCL binary failed");
                return false;
            }
            if (!program->clBinary()->loadLlvmBinary(program->llvmBinary_,
                    program->llvmBinaryIsSpir_)) {
                buildLog_
                    += "Internal error: Failed loading compiled binary!\n";
                LogError("Bad OCL Binary");
                return false;
            }

            if (!program->clBinary()->isRecompilable(program->llvmBinary_,
                                                 amd::OclElf::CAL_PLATFORM)) {
                buildLog_ += "Internal error: Input OpenCL binary is not"
                             " for the target!\n";
                LogError("OCL Binary isn't good for the target");
                return false;
            }
#if 0
                // TODO: copy .source over to output program
                char *section;
                size_t sz;

                if (clBinary()->saveSOURCE() &&
                    clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, &section, &sz)) {
                    clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz);
                }
#endif
        }

        llvmBinaries[i] = &program->llvmBinary_;
        llvmBinaryIsSpir[i] = program->llvmBinaryIsSpir_;
    }

    acl_error err;
    aclTargetInfo aclinfo = info();
    aclBinaryOptions binOpts = {0};
    binOpts.struct_size = sizeof(binOpts);
    binOpts.elfclass = aclinfo.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
    binOpts.bitness = ELFDATA2LSB;
    binOpts.alloc = &::malloc;
    binOpts.dealloc = &::free;

    std::vector<aclBinary*> libs(llvmBinaries.size(), NULL);
    for (size_t i = 0; i < libs.size(); ++i) {
        libs[i] = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err);
        if (err != ACL_SUCCESS) {
            LogWarning("aclBinaryInit failed");
            break;
        }

        err = aclInsertSection(dev().compiler(), libs[i],
            llvmBinaries[i]->data(), llvmBinaries[i]->size(),
            llvmBinaryIsSpir[i]?aclSPIR:aclLLVMIR);
        if (err != ACL_SUCCESS) {
            LogWarning("aclInsertSection failed");
            break;
        }

        // temporary solution to synchronize buildNo between runtime and complib
        // until we move runtime inside complib
        ((amd::option::Options*)libs[i]->options)->setBuildNo(
            options->getBuildNo());
        }


    if (libs.size() > 0 && err == ACL_SUCCESS) do {
        unsigned int numLibs = libs.size() - 1;
        if (numLibs > 0) {
            err = aclLink(dev().compiler(), libs[0], numLibs, &libs[1],
                ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL);

            buildLog_ += aclGetCompilerLog(dev().compiler());

            if (err != ACL_SUCCESS) {
                LogWarning("aclLink failed");
                break;
            }
        }

        size_t size = 0;
        const void* llvmir = aclExtractSection(dev().compiler(), libs[0],
            &size, aclLLVMIR, &err);
        if (err != ACL_SUCCESS) {
            LogWarning("aclExtractSection failed");
            break;
        }

        llvmBinary_.assign(reinterpret_cast<const char*>(llvmir), size);
        llvmBinaryIsSpir_ = false;
    } while(0);

    std::for_each(libs.begin(), libs.end(), std::ptr_fun(aclBinaryFini));

    if (err != ACL_SUCCESS) {
        buildLog_ += "Error: linking llvm modules failed!";
        return false;
    }

    if (clBinary()->saveLLVMIR()) {
        clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR,
                                        llvmBinary_.data(), llvmBinary_.size(),
                                       false);
        // store the original link options
        clBinary()->storeLinkOptions(linkOptions_);

        clBinary()->storeCompileOptions(compileOptions_);
    }

    // skip the rest if we are building an opencl library
    if (createLibrary) {
        setType(TYPE_LIBRARY);
        if (!createBinary(options)) {
            buildLog_ += "Intenral error: creating OpenCL binary failed\n";
            return false;
        }

        return true;
    }

    // Compile llvm binary to the IL source code
    // This is link/OPT/Codegen part of compiler.
    cl_int iErr = compileBinaryToIL(options);
    if (iErr != CL_SUCCESS) {
        buildLog_ += "Error: Compilation from LLVMIR binary to IL text failed!";
        LogError(buildLog_.c_str());
        return false;
    }

    if (!ilProgram_.empty() && options->oVariables->EnableDebug) {
        // Lets parse out the dwarf debug information and store it in the elf
        llvm::CompUnit compilation(ilProgram_);
        std::string debugILStr = compilation.getILStr();
        const char* dbgSec = debugILStr.c_str();
        size_t dbgSize = debugILStr.size();
        // Add an IL section that contains debug information and is the
        // output of LLVM codegen.
        clBinary()->elfOut()->addSection(amd::OclElf::ILDEBUG, dbgSec, dbgSize);

        if ((dbgSize > 0) && options->isDumpFlagSet(amd::option::DUMP_DEBUGIL)) {
            std::string debugilWithLine;
            size_t b = 1;
            size_t e;
            int linenum=0;
            char cstr[9];
            cstr[8] = 0;
            while (b != std::string::npos) {
                e = debugILStr.find_first_of("\n", b);
                if (e != std::string::npos) {
                    ++e;
                }
                sprintf(&cstr[0], "%5x:  ", linenum);
                debugilWithLine.append(cstr);
                debugilWithLine.append(debugILStr.substr(b,e-b));
                b = e;
                ++linenum;
            }
            std::string debugilFileName = options->getDumpFileName(".debugil");
            std::fstream f;
            f.open(debugilFileName.c_str(), (std::fstream::out | std::fstream::binary));
            f.write(debugilWithLine.c_str(), debugilWithLine.size());
            f.close();
        }

        for (unsigned x = 0; x < llvm::AMDILDwarf::DEBUG_LAST; ++x) {
            dbgSec = compilation.getDebugData()->getDwarfBitstream(
                    static_cast<llvm::AMDILDwarf::DwarfSection>(x), dbgSize);
            // Do not create an elf section if the size of the section is
            // 0.
            if (!dbgSize) {
                continue;
            }
            clBinary()->elfOut()->addSection(
                    static_cast<amd::OclElf::oclElfSections>(x
                        + amd::OclElf::DEBUG_INFO), dbgSec, dbgSize);
        }

    }

    // Create kernel objects
    if (!ilProgram_.empty() && parseKernels(ilProgram_)) {
        // Loop through all possible kernels
        for (size_t i = 0; i < funcs_.size(); ++i) {
            ILFunc* baseFunc = funcs_[i];
            // Make sure we have a Kernel function, but not Intrinsic or Simple
            if (baseFunc->state_ == ILFunc::Kernel) {
                size_t      metadataSize =
                    baseFunc->metadata_.end_ - baseFunc->metadata_.begin_;
                std::string             kernel = ilProgram_;
                std::string             metadataStr;
                std::vector<ILFunc*>    notCalled;
                std::vector<ILFunc*>    called;
                std::map<int, const char**>   macros;
                size_t  j;
                Kernel::InitData    initData = {0};

                // Fill the list of not used functions, relativly to the current
                for (j = 0; j < funcs_.size(); ++j) {
                    if ((i != j) &&
                        ((funcs_[j]->state_ == ILFunc::Regular) ||
                         (funcs_[j]->state_ == ILFunc::Kernel))) {
                        if (!isCalled(baseFunc, funcs_[j])) {
                            notCalled.push_back(funcs_[j]);
                        }
                        else {
                            called.push_back(funcs_[j]);
                        }
                    }
                }

                // Get the metadata string for the current kernel
                metadataStr.insert(0, kernel,
                    baseFunc->metadata_.begin_, metadataSize);

                std::vector<ILFunc::SourceRange*> rangeList;
                // Remove unused kernels, starting from the end
                for (j = notCalled.size(); j > 0; --j) {
                    ILFunc* func = notCalled[j-1];
                    std::vector<ILFunc::SourceRange*>::iterator it;
                    for (it = rangeList.begin(); it != rangeList.end(); ++it) {
                        if ((*it)->begin_ < func->metadata_.begin_) {
                            assert((*it)->begin_ < func->code_.begin_
                                && "code and metadata not next to each other");
                            break;
                        }
                        assert((*it)->begin_ >= func->code_.begin_
                            && "code and metadata not next to each other");
                    }
                    assert(func->metadata_.begin_ > func->code_.begin_
                           && "code after metadata");
                    if (it == rangeList.end()) {
                        rangeList.push_back(&func->metadata_);
                        rangeList.push_back(&func->code_);
                    }
                    else {
                        it = rangeList.insert(it, &func->code_);
                        rangeList.insert(it, &func->metadata_);
                    }
                }
                for (j = 0; j < rangeList.size(); ++j) {
                    const ILFunc::SourceRange* range = rangeList[j];
                    kernel.erase(range->begin_, range->end_ - range->begin_);
                }

                // Patch the main program with a call to the current kernel
                patchMain(kernel, baseFunc->index_);

                // Add macros at the top, loop through all available functions
                // for this kernel
                for (j = 0; j <= called.size(); ++j) {
                    ILFunc* func = (j < called.size()) ? called[j] : baseFunc;
                    for (size_t l = func->macros_.size(); l > 0 ; --l) {
                        int lines;
                        int idx = static_cast<int>(func->macros_[l - 1]);
                        const char** macro = amd::MacroDBGetMacro(&lines, idx);

                        // Make sure we didn't place this macro already
                        if (macros[idx] == NULL) {
                            macros[idx] = macro;
                            // Do we have a valid macro?
                            if ((lines == 0) || (macro == NULL)) {
                                buildLog_ += "Error: undefined macro!\n";
                                LogPrintfError(
                                    "Metadata reports undefined macro %d!", idx);
                                return false;
                            }
                            else {
                                // Add the macro to the IL source
                                for (int k = 0; k < lines; ++k) {
                                    kernel.insert(0, macro[k], strlen(macro[k]));
                                }
                            }
                        }
                    }
                    // Accumulate all emulated local and private sizes,
                    // necessary for the kernel execution
                    initData.localSize_   += func->localSize_;
                    initData.privateSize_ += func->privateSize_;

                    // Accumulate all HW local and private sizes,
                    // necessary for the kernel execution
                    initData.hwLocalSize_   += func->hwLocalSize_;
                    initData.hwPrivateSize_ += func->hwPrivateSize_;
                    initData.flags_         |= func->flags_;
                }

                // Create a GPU kernel
                bool    created;
                NullKernel* gpuKernel = createKernel(baseFunc->name_,
                    &initData, kernel.data(), metadataStr, &created);
                if (!created) {
                    buildLog_ += "Error: Creating kernel " +
                        baseFunc->name_ + " failed!\n";
                    LogError(buildLog_.c_str());
                    return false;
                }

                // Add the current kernel to the binary
                if (!clBinary()->storeKernel(baseFunc->name_, gpuKernel,
                    &initData, metadataStr, kernel)) {
                    buildLog_ += "Internal error: adding a kernel into OpenCL binary failed!\n";
                    return false;
                }
            }
            else {
                // Non-kernel function, save metadata symbols for recompilation
                if (clBinary()->saveAMDIL()) {
                    size_t      metadataSize =
                        baseFunc->metadata_.end_ - baseFunc->metadata_.begin_;
                    if (metadataSize <= 0) {
                        continue;
                    }
                    std::string metadataStr;
                    // Get the metadata string
                    metadataStr.insert(0, ilProgram_, baseFunc->metadata_.begin_,
                        metadataSize);

                    std::stringstream  aStream;
                    aStream << "__OpenCL_" << baseFunc->name_ << "_fmetadata";
                    std::string metaName = aStream.str();
                    // Save metadata symbols in .rodata
                    if (!clBinary()->elfOut()->addSymbol(amd::OclElf::RODATA,
                                                        metaName.c_str(),
                                                        metadataStr.data(),
                                                        metadataStr.size())) {
                        buildLog_ += "Internal error: addSymbol failed!\n";
                        LogError ("AddSymbol failed");
                        return false;
                    }
                }
            }
        }

        setType(TYPE_EXECUTABLE);
        if (!createBinary(options)) {
            buildLog_ += "Intenral error: creating OpenCL binary failed\n";
            return false;
        }

        // Destroy all ILFunc objects
        freeAllILFuncs();
        ilProgram_.clear();
        return true;
    }
    return false;
}

// Lazily creates the ClBinary wrapper for this program.
// Returns true when clBinary_ is non-NULL on exit, false otherwise.
bool
NullProgram::initClBinary()
{
    // An existing wrapper is reused as-is
    if (clBinary_ != NULL) {
        return true;
    }

    clBinary_ = new ClBinary(static_cast<const Device&>(device()));
    return (clBinary_ != NULL);
}

// Destroys the ClBinary wrapper (if any) and resets the pointer to NULL.
void
NullProgram::releaseClBinary()
{
    // Nothing to do when no wrapper was ever created
    if (clBinary_ == NULL) {
        return;
    }

    delete clBinary_;
    clBinary_ = NULL;
}

// Loads the kernels of this program from its CL binary.
// hasRecompiled is forwarded untouched to ClBinary::loadKernels().
// On failure clear() is invoked before false is returned.
bool
NullProgram::loadBinary(bool* hasRecompiled)
{
    if (clBinary()->loadKernels(*this, hasRecompiled)) {
        return true;
    }

    clear();
    return false;
}

// Parses every ";#DATASTART ... DATAEND" global data description found in
// 'source' at or after position 'start'.
//
// For each description the routine:
//   - reads the store index and size ("DATASTART:<index>:<size>") or, for the
//     emulated store, only the size ("DATASTART:<size>", index forced to 0),
//   - fills a temporary byte buffer from the typed initializer records
//     (<tag> <offset> <count> followed by <count> hex values),
//   - hands the buffer to allocGlobalData() and ClBinary::storeGlobalData(),
//   - erases the parsed description from 'source' (through const_cast, i.e.
//     the caller's string IS modified despite the const reference).
//
// Returns false on any parse error, on a zero data size, on an initializer
// record that would not fit into the declared data size, or when
// allocation/storing of the data fails; true otherwise (including when no
// description is present).
bool
NullProgram::initGlobalData(const std::string& source, size_t start)
{
    size_t  pos, dataStart;

    // Find the global data store
    dataStart= source.find(";#DATASTART", start);
    if (dataStart!= std::string::npos) {
        uint    index = 0;
        // Skip the leading ";#" so that pos points at "DATASTART"
        pos = dataStart + 2;
        while (expect(source, &pos, "DATASTART:")) {
            uint    dataSize = 0;
            uint    offset;
            uint    numElements;
            size_t  posStart;
            bool failed = false;

            // Kernel has the global constants
            if (!getuint(source, &pos, &index)) {
                return false;
            }
            pos--;
            if (expect(source, &pos, ":")) {
                // Read the size
                if (!getuint(source, &pos, &dataSize)) {
                    return false;
                }
            }
            else {
                // Emulated global data store
                pos++;
                dataSize = index;
                index = 0;
            }

            if (dataSize == 0) {
                return false;
            }

            posStart = pos = source.find_first_not_of(";# \n\r", pos);

            char* globalData = new char[dataSize];
            if (globalData == NULL) {
                return false;
            }

            // Consume the initializer records until the DATAEND tag
            while (!expect(source, &pos, "DATAEND")) {
                for (uint i = 0; i < DataTypeTotal; ++i) {
                    if (expect(source, &pos, DataType[i].tagName_)) {
                        // Read the offset
                        if (!getuint(source, &pos, &offset)) {
                            // Do not leak the staging buffer on a parse error
                            delete [] globalData;
                            return false;
                        }
                        // Read the number of elements
                        if (!getuint(source, &pos, &numElements)) {
                            delete [] globalData;
                            return false;
                        }

                        // Size in bytes of a single element as it is written
                        // into the buffer by the switch below (0 == nothing
                        // is written for this type)
                        size_t  elemSize = 0;
                        switch (DataType[i].type_) {
                        case KernelArg::Struct:
                        case KernelArg::Union:
                        case KernelArg::Char:
                            elemSize = sizeof(uint8_t);
                            break;
                        case KernelArg::Short:
                            elemSize = sizeof(uint16_t);
                            break;
                        case KernelArg::Float:
                        case KernelArg::Int:
                        case KernelArg::UInt:
                            elemSize = sizeof(uint32_t);
                            break;
                        case KernelArg::Double:
                        case KernelArg::Long:
                        case KernelArg::ULong:
                            elemSize = sizeof(uint64_t);
                            break;
                        default:
                            break;
                        }

                        // Reject records that would write past the end of
                        // the staging buffer (offset/count come from the
                        // parsed text and are otherwise unchecked)
                        if ((elemSize != 0) && (numElements != 0) &&
                            ((offset > dataSize) ||
                             (numElements > (dataSize - offset) / elemSize))) {
                            delete [] globalData;
                            return false;
                        }

                        for (uint j = 0; j < numElements; ++j) {
                            switch (DataType[i].type_) {
                            case KernelArg::Float: {
                                uint32_t* tmp = reinterpret_cast<uint32_t*>(globalData + offset);
                                if (!getuintHex(source, &pos, &tmp[j])) {
                                    failed = true;
                                }
                            }
                                break;
                            case KernelArg::Double: {
                                uint64_t* tmp = reinterpret_cast<uint64_t*>(globalData + offset);
                                if (!getuint64Hex(source, &pos, &tmp[j])) {
                                    failed = true;
                                }
                            }
                                break;
                            case KernelArg::Struct:
                            case KernelArg::Union:
                                // Struct and Union should be presented as bytes
                                // Fall through...
                            case KernelArg::Char: {
                                uint8_t* tmp = reinterpret_cast<uint8_t*>(globalData + offset);
                                uint    value;
                                if (!getuintHex(source, &pos, &value)) {
                                    failed = true;
                                }
                                tmp[j] = static_cast<uint8_t>(value);
                            }
                                break;
                            case KernelArg::Short: {
                                uint16_t* tmp = reinterpret_cast<uint16_t*>(globalData + offset);
                                uint    value;
                                if (!getuintHex(source, &pos, &value)) {
                                    failed = true;
                                }
                                tmp[j] = static_cast<uint16_t>(value);
                            }
                                break;
                            case KernelArg::Int:
                            case KernelArg::UInt: {
                                uint32_t* tmp = reinterpret_cast<uint32_t*>(globalData + offset);
                                if (!getuintHex(source, &pos, &tmp[j])) {
                                    failed = true;
                                }
                            }
                                break;
                            case KernelArg::Long:
                            case KernelArg::ULong: {
                                uint64_t* tmp = reinterpret_cast<uint64_t*>(globalData + offset);
                                if (!getuint64Hex(source, &pos, &tmp[j])) {
                                    failed = true;
                                }
                            }
                                break;
                            case KernelArg::None:
                            default:
                                break;
                            }
                            if (failed) {
                                delete [] globalData;
                                return false;
                            }
                        }
                        break;
                    }
                }
                // No tag consumed any input: bail out instead of spinning
                if (posStart == pos) {
                    delete [] globalData;
                    return false;
                }
                posStart = pos = source.find_first_not_of(";# \n\r", pos);
            }

            if (!allocGlobalData(globalData, dataSize, index)) {
                failed = true;
            }

            if (!clBinary()->storeGlobalData(globalData, dataSize, index)) {
                failed = true;
            }

            delete [] globalData;

            // Erase the global store information
            if (index != 0) {
                if (expect(source, &pos, ":")) {
                    // Read the index that trails the DATAEND tag
                    if (!getuint(source, &pos, &index)) {
                        return false;
                    }
                }
            }
            pos = source.find_first_not_of(";# \n\r", pos);
            (const_cast<std::string&>(source)).erase(dataStart, pos - dataStart);
            // The following description (if any) now starts at dataStart
            pos = dataStart;
            if (failed) {
                return false;
            }
        }
    }

    return true;
}

// Scans 'source' for every region delimited by 'func_start' ... 'func_end'
// and registers an ILFunc (via addFunc) for each region that carries a name.
//
// Layout handled by the parser:
//   <func_start><index> [; <name>]  ...  <func_end>[ ; <name>]
// A start tag directly followed by a line break has no name ("dummy macro");
// such regions are parsed but not registered.
//
// lastFuncPos receives the position right after the last region that was
// parsed completely (0 when none was found).
// Returns false on malformed input (unreadable index or name, name missing
// after the end tag) or when an ILFunc can not be allocated; true otherwise.
bool
NullProgram::findILFuncs(const std::string& source,
                         const std::string &func_start,
                         const std::string &func_end,
                         size_t& lastFuncPos)
{
    lastFuncPos = 0;

    // Find first tag
    size_t  pos = source.find(func_start);

    // Loop through all regions introduced by the start tag
    while (pos != std::string::npos) {
        char    funcName[256];
        ILFunc  func;

        // Keep the buffer a valid (empty) C string: it is handed to expect()
        // below even when the start tag carried no name
        funcName[0] = '\0';

        func.code_.begin_ = pos;
        if (!expect(source, &pos, func_start)) {
            break;
        }

        pos = source.find_first_not_of(" \n\r", pos);
        // Read the function index
        if (!getuint(source, &pos, &func.index_)) {
            LogError("Error reading function index");
            return false;
        }

        pos = source.find_first_of(";\n\r", pos);
        if (pos == std::string::npos) {
            // The text ends right after the index, nothing valid to index
            LogError("Error reading function name");
            return false;
        }
        if (source[pos] == '\r' || source[pos] == '\n') {
            // this is the dummy macro
            func.name_ = std::string("");
        }
        else {
            pos = source.find_first_not_of("; \n\r", pos);
            // Read the function's name
            if (!getword(source, &pos, funcName)) {
                LogError("Error reading function name");
                return false;
            }
            func.name_ = funcName;
        }

        // Find the function end
        pos = source.find(func_end, pos);
        if ((pos == std::string::npos) || !expect(source, &pos, func_end)) {
            break;
        }
        if (source[pos] == '\r' || source[pos] == '\n') {
            // A named function must repeat its name after the end tag
            if (!func.name_.empty()) {
                LogError("Missing function name");
                return false;
            }
        }
        else {
            // The end tag is followed by text: it has to be the same name
            // that was read after the start tag
            pos = source.find_first_not_of("; \n\r", pos);
            if (!expect(source, &pos, funcName)) {
                LogError("Error reading function name");
                return false;
            }
        }
        // Save the function end
        func.code_.end_ = pos;

        if (!func.name_.empty()) {
            // Create a new function
            ILFunc*  clFunc = new ILFunc(func);
            if (clFunc != NULL) {
                addFunc(clFunc);
            }
            else {
                return false;
            }
        }
        lastFuncPos = pos;
        // Next function
        pos = source.find(func_start, pos);
    }

    return true;
}

// Collects the IL functions of both supported flavours:
// "func <n> ... endfunc " regions and "mdef(<n>) ... mend" regions.
// lastFuncPos is written only on success and receives the larger of the two
// end positions reported by findILFuncs().
bool
NullProgram::findAllILFuncs(const std::string& source, size_t& lastFuncPos)
{
    // Pass 1: functions defined using "func"
    size_t funcTail;
    if (!findILFuncs(source, "func ", "endfunc ", funcTail)) {
        return false;
    }

    // Pass 2: functions defined using outlined macro
    size_t macroTail;
    if (!findILFuncs(source, "mdef(", "mend", macroTail)) {
        return false;
    }

    lastFuncPos = (funcTail < macroTail) ? macroTail : funcTail;
    return true;
}

bool
NullProgram::parseAllILFuncs(const std::string& source)
{
    bool doPatch = true;
    amd::option::Options *opts = getCompilerOptions();
    if (opts->isCStrEqual(opts->oVariables->XLang, "il")) {
        doPatch = false;
    }
    // Find the patch position
    if (doPatch) {
        patch_ = source.find(";$$$$$$$$$$");
        if (patch_ == std::string::npos) {
            return false;
        }
    }

    size_t lastFuncPos = 0;
    if (!findAllILFuncs(source, lastFuncPos)) {
        return false;
    }

    // Initialize the global data if available
    if (!initGlobalData(source, lastFuncPos)) {
        LogError("We failed the global constants detection/initialization!");
        return false;
    }

    return true;
}

// Parses the ';'-separated metadata tokens of 'source' located in the range
// [posBegin, posEnd) and applies them to the ILFunc they belong to.
//
// Each token is matched against the ArgState table:
//   - error/warning messages are appended to the build log (an error token
//     aborts parsing with false),
//   - flag tokens and size tokens update the current function,
//   - the printf token fills the corresponding slot of printf_,
//   - the token whose ArgState entry has size_ == 0 carries the function
//     index; it selects the current function (findILFunc), classifies it as
//     Kernel or Regular from its name and records the metadata range,
//   - the remaining tokens carry a list of indices which are stored either as
//     called functions (ArgState name_ set) or as macros.
//
// Returns false on a parse error, on an error token, on an unknown function
// or macro index, or when a token that modifies the current function shows up
// before any function has been selected; true otherwise.
bool
NullProgram::parseFuncMetadata(const std::string& source, size_t posBegin, size_t posEnd)
{
    ILFunc* baseFunc = NULL;
    uint    index;
    size_t  pos = posBegin;
    while (pos < posEnd) {
        if (!expect(source, &pos, ";")) {
            break;
        }
        for (uint k = 0; k < DescTotal; ++k) {
            uint    funcIndex;
            uint    j;

            if (expect(source, &pos, ArgState[k].typeName_)) {
                if (ArgState[k].type_ == KernelArg::ErrorMessage) {
                    // Copy the message up to the next token into the log
                    size_t posNext = source.find(";", pos);
                    buildLog_.append("Error:");
                    buildLog_.append(source.substr(pos, posNext - pos));
                    return false;
                }
                else if (ArgState[k].type_ == KernelArg::WarningMessage) {
                    // Copy the message up to the next token into the log
                    size_t posNext = source.find(";", pos);
                    buildLog_.append("Warning:");
                    buildLog_.append(source.substr(pos, posNext - pos));
                    continue;
                }
                else if (ArgState[k].type_ == KernelArg::PrivateFixed) {
                    // Flag tokens need a selected function to apply to
                    if (baseFunc == NULL) {
                        return false;
                    }
                    baseFunc->flags_ |= Kernel::PrivateFixed;
                    continue;
                }
                else if (ArgState[k].type_ == KernelArg::ABI64Bit) {
                    if (baseFunc == NULL) {
                        return false;
                    }
                    baseFunc->flags_ |= Kernel::ABI64bit;
                    continue;
                }
                else if (ArgState[k].type_ == KernelArg::Wavefront) {
                    if (baseFunc == NULL) {
                        return false;
                    }
                    baseFunc->flags_ |= Kernel::LimitWorkgroup;
                    continue;
                }
                else if (ArgState[k].type_ == KernelArg::PrintfFormatStr) {
                    uint    tmp;
                    uint    arguments;
                    PrintfInfo  info;

                    // Read index
                    if (!getuint(source, &pos, &index)) {
                        return false;
                    }
                    if (printf_.size() <= index) {
                        printf_.resize(index + 1);
                    }
                    // Read the number of arguments
                    if (!getuint(source, &pos, &arguments)) {
                        return false;
                    }
                    for (uint j = 0; j < arguments; ++j) {
                        // Read the argument's size in bytes
                        if (!getuint(source, &pos, &tmp)) {
                            return false;
                        }
                        info.arguments_.push_back(tmp);
                    }

                    // Read length
                    if (!getuint(source, &pos, &tmp)) {
                        return false;
                    }
                    // Read string (uses length so all possible chars are valid)
                    for (size_t i = 0; i < tmp; ++i) {
                        // The declared length must not run past the text
                        if (pos >= source.size()) {
                            return false;
                        }
                        char symbol = source[pos++];
                        if (symbol == '\\') {
                            // Translate the "\n" and "\r" escape sequences,
                            // any other backslash is kept verbatim
                            switch (source[pos]) {
                            case 'n':
                                pos++;
                                symbol = '\n';
                                break;
                            case 'r':
                                pos++;
                                symbol = '\r';
                                break;
                            default:
                                break;
                            }
                        }
                        info.fmtString_.push_back(symbol);
                    }
                    if (!expect(source, &pos, ";")) {
                        return false;
                    }
                    if (baseFunc == NULL) {
                        return false;
                    }
                    printf_[index] = info;
                    baseFunc->flags_ |= Kernel::PrintfOutput;
                    // Process next token ...
                    continue;
                }
                else if (ArgState[k].type_ == KernelArg::MetadataVersion) {
                    continue;
                }

                // Read the index
                if (!getuint(source, &pos, &index)) {
                    return false;
                }

                // Size tokens: the value just read is the size itself
                switch (ArgState[k].type_) {
                case KernelArg::PrivateSize:
                    if (baseFunc == NULL) {
                        return false;
                    }
                    baseFunc->privateSize_ = index;
                    continue;
                case KernelArg::LocalSize:
                    if (baseFunc == NULL) {
                        return false;
                    }
                    baseFunc->localSize_ = index;
                    continue;
                case KernelArg::HwPrivateSize:
                    if (baseFunc == NULL) {
                        return false;
                    }
                    baseFunc->hwPrivateSize_ = index;
                    continue;
                case KernelArg::HwLocalSize:
                    if (baseFunc == NULL) {
                        return false;
                    }
                    baseFunc->hwLocalSize_ = index;
                    continue;
                default:
                    break;
                }

                if (!ArgState[k].size_) {
                    // Find the base function
                    baseFunc = findILFunc(index);
                    if (baseFunc == NULL) {
                        return false;
                    }
                    // Sanity check: the function must not have been
                    // classified by an earlier metadata block
                    if (baseFunc->state_ != ILFunc::Unknown) {
                        buildLog_ = "Error: Creating kernel ";
                        buildLog_ += baseFunc->name_;
                        buildLog_ += " failed!\n";
                        LogError(buildLog_.c_str());
                        continue;
                    }
                    // If we have __OpenCL_ prefix in the name
                    // and _kernel suffix, then this is a kernel function
                    const std::string prefix = "__OpenCL_";
                    const std::string postfix = "_kernel";
                    const std::string &fname = baseFunc->name_;
                    size_t namelen = fname.size();
                    // May wrap for very short names; it is only used after
                    // the prefix comparison succeeded, i.e. when the name is
                    // longer than the postfix
                    size_t postfixPos = namelen - postfix.size();
                    if (fname.compare(0, prefix.size(), prefix) == 0 &&
                        fname.compare(postfixPos, namelen, postfix) == 0) {
                        baseFunc->state_ = ILFunc::Kernel;
                        baseFunc->name_.erase(postfixPos, postfix.size());
                        baseFunc->name_.erase(0, prefix.size());
                    }
                    else {
                        baseFunc->state_ = ILFunc::Regular;
                    }
                    baseFunc->metadata_.begin_ = posBegin;
                    baseFunc->metadata_.end_   = posEnd;
                    continue;
                }

                // Process metadata: 'index' holds the number of entries
                for (j = 0; j < index; ++j) {
                    // Read the index
                    if (getuint(source, &pos, &funcIndex)) {
                        bool    error = false;
                        if (ArgState[k].name_) {
                            ILFunc* func = findILFunc(funcIndex);
                            if (NULL != func) {
                                if (baseFunc == NULL) {
                                    return false;
                                }
                                baseFunc->calls_.push_back(func);
                            }
                            else {
                                buildLog_ += "Error: Undeclared function index ";
                                error = true;
                            }
                        }
                        else {
                            if (funcIndex != 0xffffffff) {
                                if (baseFunc == NULL) {
                                    return false;
                                }
                                baseFunc->macros_.push_back(funcIndex);
                            }
                            else {
                                buildLog_ += "Error: Undeclared macro index ";
                                error = true;
                            }
                        }
                        if (error) {
                            char    str[8];
                            intToStr(funcIndex, str, 8);
                            buildLog_ += str;
                            buildLog_ += "\n";
                            LogError("Undeclared index!");
                            return false;
                        }
                    }
                    else {
                        return false;
                    }
                }
            }
        }
        // Next argument
        pos = source.find(";", pos);
    }
    return true;
}

// Prepares the IL text in 'source' for kernel creation:
//   1. removes every ";DEBUGSTART ... ;DEBUGEND" block from 'source'
//      (through const_cast, i.e. the caller's string IS modified),
//   2. builds the list of IL functions and the global data
//      (parseAllILFuncs),
//   3. parses up to funcs_.size() ";ARGSTART: ... ;ARGEND:<name>" metadata
//      blocks with parseFuncMetadata().
// Returns false when step 2 fails, when the name behind ";ARGEND:" can not
// be read, or when a metadata block fails to parse; true otherwise.
bool
NullProgram::parseKernels(const std::string& source)
{
    size_t  pos = 0;

    // Strip out all the debug tokens as these are
    // not needed yet, but will be used later.
    std::string& mutableSource = const_cast<std::string&>(source);
    while(1) {
        pos = source.find(";DEBUGSTART", pos);
        if (pos == std::string::npos) {
            break;
        }
        size_t last = source.find(";DEBUGEND", pos);
        if (last == std::string::npos) {
            // Unterminated debug block: drop everything up to the end
            mutableSource.erase(pos);
            break;
        }
        // ";DEBUGEND" is 9 characters long; one more character (presumably
        // the line terminator) is removed together with it
        mutableSource.erase(pos, last - pos + 10);
        // The text that followed the block now starts at 'pos', so the
        // search resumes there; advancing to 'last' would skip that text
        // and miss a debug block located right behind the erased one
    }
    // Create a list of all functions in the program
    if (!parseAllILFuncs(source)) {
        return false;
    }
    pos = 0;
    // Find all available metadata structures
    for (size_t i = 0; i < funcs_.size(); ++i) {
        char    funcName[256];
        ILFunc::SourceRange range;

        // Find function metadata start
        range.begin_ = pos = source.find(";ARGSTART:", pos);
        if (pos == std::string::npos) {
            break;
        }

        // Find function metadata end
        pos = source.find(";ARGEND:", pos);
        if (!expect(source, &pos, ";ARGEND:")) {
            break;
        }
        // Read the function's name
        if (!getword(source, &pos, funcName)) {
            return false;
        }
        // The range ends at the first non-blank character behind the name
        pos = source.find_first_not_of(" \n\r", pos);
        range.end_ = pos;
        if (!parseFuncMetadata(source, range.begin_, range.end_)) {
            return false;
        }
    }
    return true;
}

// Destroys every ILFunc object referenced from funcs_
// and leaves the list empty.
void NullProgram::freeAllILFuncs()
{
    size_t  slot = 0;
    while (slot < funcs_.size()) {
        delete funcs_[slot];
        ++slot;
    }
    funcs_.clear();
}

// Looks up the IL function with the given index.
// Returns the first entry of funcs_ whose index_ field matches,
// or NULL if there is no such entry.
ILFunc*
NullProgram::findILFunc(uint index)
{
    ILFunc* match = NULL;
    for (size_t slot = 0; (match == NULL) && (slot < funcs_.size()); ++slot) {
        ILFunc* candidate = funcs_[slot];
        if (candidate->index_ == index) {
            match = candidate;
        }
    }
    return match;
}

// Creates a NullKernel object for the provided IL code/metadata (or binary)
// and registers it in the kernel map of this program.
//
// The kernel's build log is appended to the program's build log in any case.
// On failure the kernel object is destroyed, the kernel's build error is
// saved in buildError_ and NULL is returned. '*created' is true only
// if the kernel was successfully created.
NullKernel*
NullProgram::createKernel(
    const std::string&  name, const Kernel::InitData* initData,
    const std::string&  code, const std::string&  metadata, bool* created,
    const void* binaryCode, size_t binarySize)
{
    amd::option::Options *options = getCompilerOptions();
    // The timestamp is taken only if the build timing was requested
    const uint64_t start_time =
        (options->oVariables->EnableBuildTiming) ? amd::Os::timeNanos() : 0;

    *created = false;

    // Create a GPU kernel
    NullKernel* kernel = new NullKernel(name,
        static_cast<const gpu::NullDevice&>(device()), *this);
    if (kernel == NULL) {
        buildLog_ += "new Kernel() failed";
        LogPrintfError("new Kernel() failed for kernel %s!",
            name.c_str());
        return NULL;
    }

    if (!kernel->create(code, metadata, binaryCode, binarySize)) {
        // Keep the error information before the kernel goes away
        buildError_ = kernel->buildError();
        buildLog_ += kernel->buildLog();
        delete kernel;
        LogPrintfError("Kernel creation failed for kernel %s!", name.c_str());
        return NULL;
    }

    // Add kernel to the program
    kernels()[kernel->name()] = kernel;
    buildLog_ += kernel->buildLog();

    if (options->oVariables->EnableBuildTiming) {
        std::stringstream timing;
        timing << "    Time for creating kernel (" << name << ") : ";
        timing << (amd::Os::timeNanos() - start_time)/1000ULL << " us\n";
        buildLog_ += timing.str();
    }

    *created = true;
    return kernel;
}

// Invoked from ClBinary
//
// Splits the program IL into per-kernel IL strings with llvm::CompUnit and
// stores them in 'allKernelILs', keyed by the kernel name the compilation
// unit reports. If 'ilKernelName' isn't NULL, then only the kernel named
// "__OpenCL_<ilKernelName>_kernel" is stored (if it exists).
// Always returns true.
bool
NullProgram::getAllKernelILs(std::map<std::string, std::string>& allKernelILs,
                             std::string& programIL, const char* ilKernelName)
{
    llvm::CompUnit compunit (programIL);

    // Build the mangled name only when a single kernel was requested
    const bool singleKernel = (ilKernelName != NULL);
    std::string wantedName;
    if (singleKernel) {
        wantedName = "__OpenCL_";
        wantedName += ilKernelName;
        wantedName += "_kernel";
    }

    for (int idx = 0; idx < static_cast<int>(compunit.getNumKernels()); ++idx) {
        std::string currentName = compunit.getKernelName(idx);
        if (!singleKernel) {
            // Collect every kernel in the compilation unit
            allKernelILs[currentName] = compunit.getKernelStr(idx);
            continue;
        }
        if (currentName == wantedName) {
            allKernelILs[currentName] = compunit.getKernelStr(idx);
            // The requested kernel was found, no need to look further
            break;
        }
    }
    return true;
}

// Creates the ELF binary image of the program, unless the BinBIF30 option
// is set, in which case nothing is done and true is returned.
// Returns false if the ELF image can't be created.
bool
NullProgram::createBinary(amd::option::Options* options)
{
    const bool elfImageNeeded = !options->oVariables->BinBIF30;

    if (elfImageNeeded &&
        !clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) {
        LogError("Failed to create ELF binary image!");
        return false;
    }
    return true;
}

// Releases the memory objects owned by the program: the global HW constant
// buffers and the global data store.
Program::~Program()
{
    // Destroy the global HW constant buffers
    const Program::HwConstBuffers& hwBuffers = glbHwCb();
    for (const auto& entry : hwBuffers) {
        delete entry.second;
    }

    // Destroy the global data store (deleting a NULL pointer is a no-op)
    delete glbData_;
}

// Allocates GPU memory for a block of global data and uploads the data.
//
// globalData - host pointer to the data to upload
// dataSize   - size of the data in bytes
// index      - 0 selects the global data store (glbData_), any other value
//              creates a constant buffer, which is saved in constBufs_ under
//              that index and recorded in glbCb_
//
// Returns true if the memory was allocated and the upload succeeded.
bool
Program::allocGlobalData(const void* globalData, size_t dataSize, uint index)
{
    gpu::Memory* dataStore = NULL;

    if (index == 0) {
        // We have to lock the heap block allocation,
        // so possible reallocation won't occur twice or
        // another thread could destroy a heap block,
        // while we didn't finish allocation
        amd::ScopedLock k(dev().lockAsyncOps());

        // Allocate memory for the global data store
        glbData_ = dev().createScratchBuffer(amd::alignUp(dataSize, 0x1000));
        dataStore = glbData_;
    }
    else {
        dataStore = new Memory(dev(), amd::alignUp(dataSize, ConstBuffer::VectorSize));

        // Initialize constant buffer
        if ((dataStore == NULL) || !dataStore->create(Resource::RemoteUSWC)) {
            delete dataStore;
            // The pointer must not be used after the object was destroyed,
            // otherwise the upload below would write to freed memory
            dataStore = NULL;
        }
        else {
            constBufs_[index] = dataStore;
            glbCb_.push_back(index);
        }
    }

    if (dataStore == NULL) {
        // Allocation failed, nothing to upload
        return false;
    }

    // Upload data to GPU memory
    static const bool Entire  = true;
    amd::Coord3D    origin(0, 0, 0);
    amd::Coord3D    region(dataSize);
    return dev().xferMgr().writeBuffer(globalData,
        *dataStore, origin, region, Entire);
}

// Loads the kernels and then the global data from the program's binary.
// If either step fails, then clear() is called and false is returned.
bool
Program::loadBinary(bool* hasRecompile)
{
    // The global data is loaded only if the kernels were loaded successfully
    const bool loaded = clBinary()->loadKernels(*this, hasRecompile) &&
        clBinary()->loadGlobalData(*this);

    if (!loaded) {
        // Make sure that kernels that have been generated so far shall be deleted.
        clear();
    }
    return loaded;
}

// Constructs an empty HSAIL program for the given device. All binary/memory
// pointers start as NULL and the ACL binary options (binOpts_) are set up
// for a little-endian ELF of the host's class, with malloc/free
// as the allocation callbacks.
HSAILProgram::HSAILProgram(Device& device)
    : Program(device)
    , llvmBinary_()
    , binaryElf_(NULL)
    , rawBinary_(NULL)
    , globalStore_(NULL)
    , kernels_(NULL)
    , maxScratchRegs_(0)
{
    // Start from a fully zeroed structure, then fill the used fields
    memset(&binOpts_, 0, sizeof(binOpts_));

    // Memory management callbacks
    binOpts_.alloc = &::malloc;
    binOpts_.dealloc = &::free;

    // Binary layout
    binOpts_.struct_size = sizeof(binOpts_);
    binOpts_.bitness = ELFDATA2LSB;
    binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
}

// Releases everything the HSAIL program owns: the static samplers,
// the raw binary, the ACL binary, the global store and the kernel table.
HSAILProgram::~HSAILProgram()
{
    // Destroy internal static samplers
    for (auto sampler : staticSamplers_) {
        delete sampler;
    }

    // free() accepts a NULL pointer, so no check is required
    free(rawBinary_);

    // Free the elf binary
    if ((binaryElf_ != NULL) && (aclBinaryFini(binaryElf_) != ACL_SUCCESS)) {
        LogWarning( "Error while destroying the acl binary \n" );
    }

    delete globalStore_;
    delete kernels_;
}

// Prepares the program for a build: runs the base class initialization,
// passes the per-build information to the options (the device's machine
// target name, or "gpu" if the name is missing or empty) and sets up
// the output ELF, which is dumped into a ".bin" file
// if the DUMP_BIF flag is set.
// Returns false if the base initialization or the ELF setup fails.
bool
HSAILProgram::initBuild(amd::option::Options *options)
{
    if (!device::Program::initBuild(options)) {
        return false;
    }

    const char* devName = dev().hwInfo()->machineTarget_;
    const bool  hasTargetName = (devName != NULL) && (devName[0] != '\0');
    options->setPerBuildInfo(hasTargetName ? devName : "gpu",
        clBinary()->getEncryptCode(), true);

    // Elf Binary setup
    // true means fsail required
    clBinary()->init(options, true);

    std::string outFileName;
    if (options->isDumpFlagSet(amd::option::DUMP_BIF)) {
        outFileName = options->getDumpFileName(".bin");
    }
    // No file name means the ELF isn't dumped into a file
    const char* dumpFile = outFileName.empty() ? NULL : outFileName.c_str();

    if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), dumpFile)) {
        LogError("Setup elf out for gpu failed");
        return false;
    }
    return true;
}

// Finishes a build: detaches the input/output ELF objects from the binary
// and, if the build failed, drops the stored binary. The result of the
// base class finiBuild() is returned.
bool
HSAILProgram::finiBuild(bool isBuildGood)
{
    clBinary()->resetElfOut();
    clBinary()->resetElfIn();

    const bool buildFailed = !isBuildGood;
    if (buildFailed) {
        // Prevent the encrypted binary form leaking out
        clBinary()->setBinary(NULL, 0);
    }

    return device::Program::finiBuild(isBuildGood);
}

// Links a list of programs into this program.
//
// For every input program an aclBinary with LLVMIR is obtained, all binaries
// are linked into the first one with aclLink() and the result becomes the
// binary (binaryElf_) of this program. The rest of the build is done by
// linkImpl(options).
//
// inputPrograms - the programs to link; an empty list is an error
// options       - build options, forwarded to linkImpl(options)
// createLibrary - currently unused: "-create-library" is always passed
//                 to aclLink()
//
// Returns false if the list is empty, a binary can't be read/created,
// the link step fails or linkImpl(options) fails.
bool
HSAILProgram::linkImpl(
    const std::vector<device::Program *> &inputPrograms,
    amd::option::Options *options,
    bool createLibrary)
{
    // The code below requires at least one binary to link
    if (inputPrograms.empty()) {
        buildLog_ += "Error while linking : No input programs \n";
        return false;
    }

    acl_error errorCode = ACL_SUCCESS;
    bool    linked = true;

    // For each program we need to extract the LLVMIR and create
    // aclBinary for each
    std::vector<aclBinary *> binaries_to_link;

    for (auto it = inputPrograms.begin(); it != inputPrograms.end(); ++it) {
        HSAILProgram *program = (HSAILProgram *)*it;
        // Check if the program was created with clCreateProgramWIthBinary
        binary_t binary = program->binary();
        if ((binary.first != NULL) && (binary.second > 0)) {
            // Binary already exists -- we can also check if there is no
            // opencl source code
            // Need to check if LLVMIR exists in the binary
            // If LLVMIR does not exist then is it valid
            // We need to pull out all the compiled kernels
            // We cannot do this at present because we need at least
            // Hsail text to pull the kernels out
            void *mem = const_cast<void *>(binary.first);
            binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
            if (errorCode != ACL_SUCCESS) {
                LogWarning("Error while linking : Could not read from raw binary");
                linked = false;
                break;
            }
        }
        // At this stage each HSAILProgram contains a valid binary_elf
        // Check if LLVMIR is in the binary
        // @TODO - Memory leak , cannot free this buffer
        // need to fix this.. File EPR on compiler library
        size_t llvmirSize = 0;
        aclExtractSection(dev().hsaCompiler(),
            binaryElf_, &llvmirSize, aclLLVMIR, &errorCode);
        if (errorCode != ACL_SUCCESS) {
            buildLog_ += "Error while linking : Invalid binary (Missing LLVMIR section)";
            linked = false;
            break;
        }
        // Create a new aclBinary for each LLVMIR and save it in a list
        aclBIFVersion ver = aclBinaryVersion(binaryElf_);
        aclBinary *bin = aclCreateFromBinary(binaryElf_, ver);
        if (bin == NULL) {
            buildLog_ += "Error while linking : Could not create a binary for linking \n";
            linked = false;
            break;
        }
        binaries_to_link.push_back(bin);
    }

    // At this stage each HSAILProgram in the list has an aclBinary initialized
    // and contains LLVMIR
    // We can now go ahead and link them.
    if (linked && (binaries_to_link.size() > 1)) {
        errorCode = aclLink(dev().hsaCompiler(),
            binaries_to_link[0], binaries_to_link.size() - 1,
            &binaries_to_link[1], ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL);
        if (errorCode != ACL_SUCCESS) {
            // Don't continue the build with a binary that failed to link
            buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
            buildLog_ += "Error while linking : aclLink failed \n";
            linked = false;
        }
    }

    if (!linked) {
        // Release the binaries created so far, nobody else owns them
        for (size_t i = 0; i < binaries_to_link.size(); i++) {
            aclBinaryFini(binaries_to_link[i]);
        }
        return false;
    }

    // Store the newly linked aclBinary for this program.
    binaryElf_ = binaries_to_link[0];
    // Free all the other aclBinaries
    for (size_t i = 1; i < binaries_to_link.size(); i++) {
        aclBinaryFini(binaries_to_link[i]);
    }

    // Now call linkImpl with the new options
    return linkImpl(options);
}

// Determines the compilation stage to continue from, based on the sections
// that can be extracted from binaryElf_. No validity checks of the sections'
// content are performed.
//
// complete_stages - receives the list of stages found on the way; it's
//                   cleared first and always contains at least one entry
//                   on return
//
// Returns the stage to continue the compilation from; ACL_TYPE_DEFAULT is
// returned if no usable combination of sections was found.
aclType
HSAILProgram::getNextCompilationStageFromBinary(std::vector<aclType>& complete_stages)
{
    acl_error errorCode;
    size_t secSize = 0;

    // Reports if the given section can be extracted from the binary
    auto hasSection = [&](decltype(aclLLVMIR) section) -> bool {
        aclExtractSection(dev().hsaCompiler(),
            binaryElf_, &secSize, section, &errorCode);
        return (errorCode == ACL_SUCCESS);
    };

    complete_stages.clear();
    aclType from = ACL_TYPE_DEFAULT;
    //@TODO_HSA: r=emankov: Should we also check here for
    // ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?

    // LLVMIR in .llvmir section plus compile & link options
    // in .comment section
    const bool hasLlvmir = hasSection(aclLLVMIR);
    const bool hasOptions = hasSection(aclCOMMENT);
    if (hasLlvmir && hasOptions) {
        complete_stages.push_back(from);
        from = ACL_TYPE_LLVMIR_BINARY;
    }

    // HSAIL in .cg section
    const bool hasHsailText = hasSection(aclCODEGEN);
    // BRIG: .brig_strtab, .brig_code and .brig_operands sections
    const bool hasBrigStrtab = hasSection(aclBRIGstrs);
    const bool hasBrigCode = hasSection(aclBRIGcode);
    const bool hasBrigOps = hasSection(aclBRIGoprs);

    if (hasBrigStrtab && hasBrigCode && hasBrigOps) {
        complete_stages.push_back(from);
        from = ACL_TYPE_HSAIL_BINARY;
        // Here we should check that CG stage was done.
        // Right now there are 2 criteria to check it (besides BRIG itself):
        // 1. metadata symbols symOpenclKernel for every kernel.
        // 2. HSAIL text in aclCODEGEN section.
        // Unfortunately there is no appropriate way in Compiler Lib to check 1.
        // because kernel names are unknown here, therefore 2.

        //@TODO_HSA: r=emankov: Change the HSAIL section check,
        // when the solution with kernel names appears in Compiler Lib.
        if (hasHsailText) {
            complete_stages.push_back(from);
            from = ACL_TYPE_CG;
        }
    }
    else if (hasHsailText) {
        complete_stages.push_back(from);
        from = ACL_TYPE_HSAIL_TEXT;
    }

    // ISA in .text section: a binary with ISA, where only the LLVMIR stage
    // was detected, is reported as ACL_TYPE_DEFAULT
    const bool hasShaderIsa = hasSection(aclTEXT);
    if (hasShaderIsa && (from == ACL_TYPE_LLVMIR_BINARY)) {
        complete_stages.clear();
        from = ACL_TYPE_DEFAULT;
    }

    if (complete_stages.empty()) {
        complete_stages.push_back(from);
    }
    return from;
}

// Builds the program from its ACL binary up to the CG stage, loads the
// result with aclHsaLoader() and creates a HSAILKernel object for every
// kernel found in the HSAIL text.
//
// If binaryElf_ isn't available, but the program has a binary, then the
// binary is read and the stage to continue from is derived from its
// sections. On success the final binary is saved with setBinary().
//
// Returns false on any failure; the reason is appended to buildLog_.
bool
HSAILProgram::linkImpl(amd::option::Options* options)
{
    // Initialized, so the check after the switch never reads
    // an indeterminate value
    acl_error errorCode = ACL_SUCCESS;
    aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
    // If !binaryElf_ then program must have been created using clCreateProgramWithBinary
    if (!binaryElf_) {
        binary_t binary = this->binary();
        // If the binary already exists
        if ((binary.first != NULL) && (binary.second > 0)) {
            void *mem = const_cast<void *>(binary.first);
            binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
            if (errorCode != ACL_SUCCESS) {
                buildLog_ += "Error while BRIG Codegen phase: aclReadFromMem failure \n" ;
                LogWarning("aclReadFromMem failed");
                return false;
            }
            // Calculate the next stage to compile from, based on sections in binaryElf_;
            // No any validity checks here
            std::vector<aclType> complete_stages;
            continueCompileFrom = getNextCompilationStageFromBinary(complete_stages);
            //@TODO_HSA: r=emankov: Should we also check here for
            // ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT to recompile from?
            if (ACL_TYPE_DEFAULT == continueCompileFrom) {
                buildLog_ += "Error while BRIG Codegen phase: the binary is incomplete \n" ;
                return false;
            }
            if (ACL_TYPE_HSAIL_BINARY == continueCompileFrom) {
                // Saving binary in the interface class,
                // which also load compile & link options from binary
                setBinary(static_cast<char*>(mem), binary.second);
                // Check the previous completed stage
                if (ACL_TYPE_LLVMIR_BINARY == complete_stages.back()) {
                    // Compare options loaded from binary with current ones
                    // If they differ then recompile from ACL_TYPE_LLVMIR_BINARY
                    //@TODO_HSA: r=emankov: Should we need to check options at all?
                    std::string curOptions = options->origOptionStr + hsailOptions();
                    if (compileOptions_ + linkOptions_ != curOptions) {
                       continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
                    }
                }
            }
        }
    }
    switch (continueCompileFrom) {
      default:
        break;
      // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases:
      // 1. if the program is not created with binary;
      // 2. if the program is created with binary and contains only .llvmir & .comment
      // 3. if the program is created with binary, contains .llvmir, .comment, brig sections,
      //    but the binary's compile & link options differ from current ones (recompilation);
      case ACL_TYPE_LLVMIR_BINARY:
      // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases:
      // 1. if the program is created with binary and contains only brig sections
      case ACL_TYPE_HSAIL_BINARY:
      // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases:
      // 1. if the program is created with binary and contains only hsail text
      case ACL_TYPE_HSAIL_TEXT:
      {
          std::string curOptions = options->origOptionStr + hsailOptions();
          errorCode = aclCompile(dev().hsaCompiler(), binaryElf_,
              curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, NULL);
          buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
          break;
      }
    }
    if (errorCode != ACL_SUCCESS) {
        buildLog_ += "Error while BRIG Codegen phase: compilation error \n" ;
        return false;
    }

    if (!aclHsaLoader(dev().hsaCompiler(), binaryElf_, this, &AllocateGPUMemory,
        &DmaMemoryCopy, &GetSamplerObjectParams, &InitializeSamplerObject)) {
        buildLog_ += "Error while BRIG Codegen phase: loading BRIG globals in the ELF \n";
        return false;
    }
    // We need to pull out kernels' names for finalizing kernels
    //@TODO_HSA: r=emankov: rewrite the below code,
    // if another way to obtain kernel names appears in the compiler library
    size_t fsailSize;
    const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symHSAILText);
    assert(symbol && "symbol not found");
    if (symbol == NULL) {
        // The assert is compiled out in release builds
        buildLog_ += "Error while reading out the HSAIL from the ELF \n" ;
        return false;
    }
    std::string symName = symbol->str[PRE] + std::string("main") + symbol->str[POST];
    const void *hsailText = aclExtractSymbol(dev().hsaCompiler(),
        binaryElf_,
        &fsailSize,
        aclCODEGEN,
        symName.c_str(),
        &errorCode);
    // A NULL text can't be converted into a string
    if ((errorCode != ACL_SUCCESS) || (hsailText == NULL)) {
        buildLog_ += "Error while reading out the HSAIL from the ELF \n" ;
        return false;
    }
    std::string hsailProgram((char *)hsailText);
    HSAILProgram_ = hsailProgram;
    if (!HSAILProgram_.empty()) {
        bool    dynamicParallelism = false;
        // Find out the name of the kernel. Works for multiple kernels
        size_t  pos = 0;
        while (true) {
            std::string findString = "kernel &";
            size_t kernelNPos = HSAILProgram_.find(findString, pos);
            if (kernelNPos == std::string::npos) {
                break;
            }
            size_t kernelEndNPos = HSAILProgram_.find("l(", kernelNPos);
            if (kernelEndNPos == std::string::npos) {
                break;
            }
            // The next search starts right after the current kernel's name
            pos = kernelEndNPos + 1;
            // "kernel &" is 8
            // "__OpenCL_" is 9
            // "_kerne" is 6
            // We can drop all this with a compiler tweak later
            std::string kernelName = HSAILProgram_.substr(kernelNPos + 8 + 9,
                kernelEndNPos -
                (kernelNPos + 8 + 9) - 6);
            HSAILKernel *aKernel = new HSAILKernel(kernelName, this,
                options->origOptionStr + hsailOptions());
            if (!aKernel->init() ) {
                // The kernel isn't in the kernel map yet, so it has to be
                // destroyed here. Keep its build log for the user.
                buildLog_ += aKernel->buildLog();
                delete aKernel;
                return false;
            }
            buildLog_ += aKernel->buildLog();
            aKernel->setUniformWorkGroupSize(options
              ->oVariables->UniformWorkGroupSize);
            kernels()[kernelName] = aKernel;
            dynamicParallelism |= aKernel->dynamicParallelism();
            // Find max scratch regs used in the program
            // It's used for scratch buffer preallocation with
            // dynamic parallelism, since runtime doesn't know
            // which child kernel will be called
            maxScratchRegs_ = std::max(
                static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_),
                maxScratchRegs_);
        }

        // Allocate kernel table for device enqueuing
        if (dynamicParallelism && !allocKernelTable()) {
            return false;
        }

        // Save the binary in the interface class
        size_t size = 0;
        void *mem = NULL;
        aclWriteToMem(binaryElf_, &mem, &size);
        setBinary(static_cast<char*>(mem), size);
        buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
        return true;
    }
    return false;
}

// Stub implementation: the options are ignored and
// false is returned unconditionally.
bool
HSAILProgram::createBinary(amd::option::Options *options)
{
    return false;
}

// Creates the ClBinaryHsa object of the program, if it doesn't exist yet.
// Returns true if the object is available after the call.
bool
HSAILProgram::initClBinary()
{
    if (clBinary_ != NULL) {
        // Already created, keep the existing object
        return true;
    }
    clBinary_ = new ClBinaryHsa(static_cast<const Device &>(device()));
    return (clBinary_ != NULL);
}

// Destroys the binary object of the program and resets the pointer.
void
HSAILProgram::releaseClBinary()
{
    // Deleting a NULL pointer is a no-op, so no check is required
    delete clBinary_;
    clBinary_ = NULL;
}

// Builds the string with the device specific compiler options:
// the FMA defines reported by the device settings, " -m64" (added through
// the LP64_ONLY macro) and a " -D<extension>=1" define for every
// space separated token of the device's extensions string.
std::string
HSAILProgram::hsailOptions()
{
    std::string deviceOpts;

    // Set options for the standard device specific options
    // All our devices support these options now
    if (dev().settings().reportFMAF_) {
        deviceOpts += " -DFP_FAST_FMAF=1";
    }
    if (dev().settings().reportFMA_) {
        deviceOpts += " -DFP_FAST_FMA=1";
    }

    // Check if the host is 64 bit or 32 bit
    LP64_ONLY(deviceOpts.append(" -m64"));

    // Append each extension supported by the device
    std::istringstream extensions("");
    extensions.str(device().info().extensions_);
    for (std::string name; getline(extensions, name, ' '); ) {
        // Consecutive spaces produce empty tokens, skip them
        if (name.empty()) {
            continue;
        }
        deviceOpts += " -D";
        deviceOpts += name;
        deviceOpts += "=1";
    }
    return deviceOpts;
}

// Allocates the kernel table used for device enqueuing and fills it.
//
// The table has one size_t entry per kernel of the program. The entry at
// the kernel's index() receives the virtual address of the kernel's AQL code.
//
// Returns false if the memory can't be created or mapped. In that case
// kernels_ is reset to NULL, so the destructor won't delete the object
// for the second time.
bool
HSAILProgram::allocKernelTable()
{
    uint size = kernels().size() * sizeof(size_t);

    kernels_ = new gpu::Memory(dev(), size);
    // Initialize kernel table
    if ((kernels_ == NULL) || !kernels_->create(Resource::RemoteUSWC)) {
        delete kernels_;
        // Don't leave a dangling pointer behind: ~HSAILProgram() deletes kernels_
        kernels_ = NULL;
        return false;
    }

    size_t* table = reinterpret_cast<size_t*>(
        kernels_->map(NULL, gpu::Resource::WriteOnly));
    if (table == NULL) {
        // The table can't be written without a valid mapping
        delete kernels_;
        kernels_ = NULL;
        return false;
    }

    for (auto it = kernels().begin(); it != kernels().end(); ++it) {
        HSAILKernel* kernel = static_cast<HSAILKernel*>(it->second);
        table[kernel->index()] = static_cast<size_t>(
            kernel->gpuAqlCode()->vmAddress());
    }
    kernels_->unmap(NULL);
    return true;
}

// Appends the AQL code resource of every kernel of the program
// to the provided resource list.
void
HSAILProgram::fillResListWithKernels(
    std::vector<const Resource*>& memList) const
{
    for (const auto& entry : kernels()) {
        HSAILKernel* kernel = static_cast<HSAILKernel*>(entry.second);
        memList.push_back(kernel->gpuAqlCode());
    }
}


} // namespace gpu