P4 to Git Change 1170297 by nhaustov@nhaustov_hsa on 2015/07/14 05:36:10

ECR #333753 - ORCA RT/Compiler Lib: HSA Code Object/RT independent loader introducing/integration into OpenCL. Changes by Evgeniy Mankov. Purpose: Use the same Finalizer & loader for both HSA & ORCA RT. AMDIL path is not affected. Changes: 1. The whole BRIG is finalized now instead of per kernel finalization (both in gpuprogram & hsail_be). 2. HSALoader is changed in order to work with CodeObject and new HSA Loader's API - Context. Now it is in ORCA's gpuprogram instead of Compiler Lib. 3. brig_loader.cpp is removed from compiler lib, as well as _aclHsaLoader function exports from the whole stack. 4. BIF .text section now contains the whole finalized HSA CodeObject instead of separate symbols for finalized kernels. 5. ORCA RT now works directly with amd_kernel_code_t and doesn't need any SC metadata anymore. 6. aoc2 is supplemented with fake offline loader correspondingly. 7. amdocl/complib make system changes. 8. test_driver.pl update. ToDo: 1. Implement disassemble() & BuildLog() functions to support ISA dumping & SC error handling (Konstantin). 2. Global variables initialization by pragma reference (Konstantin). Test to verify: test_basic progvar_prog_scope_init. 3. Code Object without kernels support (Nikolay - ready). Test to verify: test_generic_address_space.exe library_function. Testing: windows smoke, pre check-in, ocl conformance 2.0, ocl SDK 2.9. Reviewers: Nikolay Haustov, German Andryeyev Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/amdocl.def.in#13 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/amdocl.map.in#15 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#116 edit ... //depot/stg/opencl/drivers/opencl/compiler/legacy-lib/amdoclcl.def.in#2 edit ... //depot/stg/opencl/drivers/opencl/compiler/legacy-lib/amdoclcl.map.in#2 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/amdoclcl.def.in#12 edit ... 
//depot/stg/opencl/drivers/opencl/compiler/lib/amdoclcl.map.in#11 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/v0_8/if_acl.cpp#70 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/build/Makefile.gpu#32 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/hsail_be.cpp#44 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/build/Makefile.complib#85 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.cpp#9 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.h#18 edit ... //depot/stg/opencl/drivers/opencl/compiler/tools/aoc2/aoc2.cpp#70 edit ... //depot/stg/opencl/drivers/opencl/compiler/tools/aoc2/build/Makefile.aoc2#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#248 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudefs.hpp#121 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#288 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#112 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#194 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.hpp#59 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuscsi.cpp#33 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#368 edit ... //depot/stg/opencl/drivers/opencl/tests/hsa/bin/test_driver.pl#12 edit [ROCm/clr commit: 8cc3f47661]
2015-07-14 17:08:54 -04:00
commit c3872376e8
@@ -50,6 +50,3 @@ aclJITObjectImageDisassembleKernel
 #endif
 aclJITObjectImageIterateSymbols
 aclJITObjectImageGetGlobalsSize
-#if defined(WITH_TARGET_HSAIL)
-_aclHsaLoader
-#endif
@@ -53,9 +53,6 @@ global:
 #endif
  aclJITObjectImageIterateSymbols;
  aclJITObjectImageGetGlobalsSize;
-#if defined(WITH_TARGET_HSAIL)
-  _aclHsaLoader;
-#endif
 #if defined(OPENCL_MAINLINE)
 local:
  *;
@@ -2092,11 +2092,17 @@ if_aclGetDeviceBinary(aclCompiler *cl,
    size_t *size,
    acl_error *error_code)
 {
-  const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symISABinary);
-  assert(symbol && "symbol not found");
-  std::string kernelName = symbol->str[PRE] + std::string(kernel) + symbol->str[POST];
-  return cl->clAPI.extSym(cl, bin, size,
-      symbol->sections[0], kernelName.c_str(), error_code);
+#ifdef WITH_TARGET_HSAIL
+  if (isHSAILTarget(bin->target)) {
+    return cl->clAPI.extSec(cl, bin, size, aclTEXT, error_code);
+  } else
+#endif
+  {
+    const oclBIFSymbolStruct* sym = findBIF30SymStruct(symISABinary);
+    assert(sym && "symbol not found");
+    std::string name = sym->str[PRE] + std::string(kernel) + sym->str[POST];
+    return cl->clAPI.extSym(cl, bin, size, sym->sections[0], name.c_str(), error_code);
+  }
 }

 acl_error  ACL_API_ENTRY
@@ -17,6 +17,12 @@ extern aclBinary* constructBinary(size_t struct_version,
    const aclTargetInfo *target,
    const aclBinaryOptions *opts);

+static const std::string sgfx700 = "AMD:AMDGPU:7:0:0";
+static const std::string sgfx701 = "AMD:AMDGPU:7:0:1";
+static const std::string sgfx800 = "AMD:AMDGPU:8:0:0";
+static const std::string sgfx801 = "AMD:AMDGPU:8:0:1";
+static const std::string sgfx900 = "AMD:AMDGPU:9:0:0";
+
 // Utility function to set a flag in option structure
 // of the aclDevCaps.
 void
@@ -501,6 +507,54 @@ unsigned getChipEnum(const aclTargetInfo *target)
  return Mapping.chip_enum;
 }

+/*! Returns the ISA type name (compute capability) selected by the family_enum and
+ *  chip_enum of the target's TargetMapping entry; unknown values use the switch defaults.
+ */
+const std::string &getIsaTypeName(const aclTargetInfo *target)
+{
+  const TargetMapping& Mapping = getTargetMapping(*target);
+  switch (Mapping.family_enum) {
+    default: return sgfx700;
+    case FAMILY_KV:
+      switch (Mapping.chip_enum) {
+        default: return sgfx700;
+        case KV_SPECTRE_A0:
+        case KV_SPOOKY_A0:
+        case KB_KALINDI_A0:
+        // NOTE(review): the author marked this spot "???"; presumably the Godavari mapping is unconfirmed.
+        case ML_GODAVARI_A0: return sgfx700;
+      }
+    case FAMILY_CI:
+      switch (Mapping.chip_enum) {
+        default: return sgfx700;
+        case CI_BONAIRE_M_A0:
+        case CI_BONAIRE_M_A1: return sgfx700;
+        case CI_HAWAII_P_A0: return sgfx701;
+        case CI_TIRAN_P_A0:
+        case CI_MAUI_P_A0: return sgfx700;
+      }
+    case FAMILY_VI:
+      switch (Mapping.chip_enum) {
+        default: return sgfx800;
+        case VI_ICELAND_M_A0:
+        case VI_TONGA_P_A0: return sgfx800;
+        case VI_ELLESMERE_P_A0:
+        case VI_BAFFIN_M_A0:
+        case VI_FIJI_P_A0: return sgfx801;
+      }
+    case FAMILY_CZ:
+      switch (Mapping.chip_enum) {
+        default: return sgfx801;
+        case CARRIZO_A0: return sgfx801;
+      }
+    case FAMILY_AI:
+      switch (Mapping.chip_enum) {
+        default: return sgfx900;
+        case AI_GREENLAND_P_A0: return sgfx900;
+      }
+  }
+}
+
 void
 appendLogToCL(aclCompiler *cl, const std::string &logStr)
 {
@@ -36,16 +36,19 @@ initElfDeviceCaps(aclBinary *elf);
 void
 appendLogToCL(aclCompiler *cl, const std::string &logStr);

-const char *getDeviceName(const aclTargetInfo &Target);
+const char *getDeviceName(const aclTargetInfo &target);

 // Select the correct library from the target information.
-amd::LibrarySelector getLibraryType(const aclTargetInfo *Target);
+amd::LibrarySelector getLibraryType(const aclTargetInfo *target);

 // get family_enum from the target information.
-unsigned getFamilyEnum(const aclTargetInfo *Target);
+unsigned getFamilyEnum(const aclTargetInfo *target);

 // get chip_enum from the target information.
-unsigned getChipEnum(const aclTargetInfo *Target);
+unsigned getChipEnum(const aclTargetInfo *target);
+
+// get isa type name (compute capability) from the target information.
+const std::string &getIsaTypeName(const aclTargetInfo *target);

 // Create a copy of an ELF and duplicate all sections/symbols
 aclBinary*
@@ -940,6 +940,8 @@ public:
    //! Return the build log
    const std::string& buildLog() const { return buildLog_; }

+    static std::string openclMangledName(const std::string& name) { return "&__OpenCL_" + name + "_kernel"; }
+
 protected:
    std::string     name_;              //!< kernel name
    WorkGroupInfo   workGroupInfo_;     //!< device kernel info structure
@@ -128,6 +128,13 @@ static const AMDDeviceInfo DeviceInfo[] = {
 /* CAL_TARGET_GREENLAND */  { ED_ATI_CAL_MACHINE_GREENLAND_ISA, "",  "",   4, 16, 1, 256, 64 * Ki, 32, 900 },
 };

+static const char* const Gfx700 = "AMD:AMDGPU:7:0:0";  //!< ISA name strings; compared against loader-supplied names in ORCAHSALoaderContext::IsaFromName
+static const char* const Gfx701 = "AMD:AMDGPU:7:0:1";
+static const char* const Gfx800 = "AMD:AMDGPU:8:0:0";
+static const char* const Gfx801 = "AMD:AMDGPU:8:0:1";
+static const char* const Gfx810 = "AMD:AMDGPU:8:1:0";
+static const char* const Gfx900 = "AMD:AMDGPU:9:0:0";  // pointers are const so no includer can reseat them
+
 // Supported OpenCL versions
 enum OclVersion {
    OpenCL10,
@@ -3552,12 +3552,12 @@ HSAILKernel::~HSAILKernel()
 }

 bool
-HSAILKernel::init(bool finalize)
+HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
 {
    acl_error error;
-    const oclBIFSymbolStruct* sym = findBIF30SymStruct(symOpenclKernel);
-    assert(sym && "symbol not found");
-    std::string openClKernelName(std::string("&") + sym->str[PRE] + name() + sym->str[POST]);
+    const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
+    assert(bifSym && "symbol not found");
+    std::string openClKernelName(std::string("&") + bifSym->str[PRE] + name() + bifSym->str[POST]);
    //compile kernel down to ISA
    if (finalize) {
        std::string options(compileOptions_.c_str());
@@ -3578,19 +3578,10 @@ HSAILKernel::init(bool finalize)
            return false;
        }
    }
-    // Get the ISA out
-    size_t  size_isa;
-    void*   shader_isa = NULL;
-    shader_isa = const_cast<void *>(aclGetDeviceBinary(dev().hsaCompiler(),
-        prog().binaryElf(), openClKernelName.c_str(), &size_isa, &error));
-    if (shader_isa == NULL) {
-        LogError("Failed find the ISA");
-        return false;
-    }

    // Allocate HW resources for the real program only
    if (!prog().isNull()) {
-        aqlCreateHWInfo(shader_isa, size_isa);
+        aqlCreateHWInfo(sym);
    }

    // Pull out metadata from the ELF
@@ -4131,8 +4122,8 @@ HSAILKernel::loadArguments(

    memList.push_back(cb);
    memList.push_back(gpuAqlCode());
-    if (NULL != prog().globalStore()) {
-        memList.push_back(prog().globalStore());
+    for (gpu::Memory * mem : prog().globalStores()) {
+        memList.push_back(mem);
    }
    if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
          AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
@@ -17,6 +17,15 @@
 #include "device/gpu/gpuprintf.hpp"
 #include "device/gpu/gpuwavelimiter.hpp"
 #include "hsa.h"
+
+namespace amd {
+namespace hsa {
+namespace loader {
+class Symbol;
+} // loader
+} // hsa
+} // amd
+
 //! \namespace gpu GPU Device Implementation
 namespace gpu {

@@ -847,7 +856,7 @@ public:

    //! Initializes the metadata required for this kernel,
    //! finalizes the kernel if needed
-    bool init(bool finalize = true);
+    bool init(amd::hsa::loader::Symbol *sym, bool finalize = false);

    //! Returns true if memory is valid for execution
    virtual bool validateMemory(uint idx, amd::Memory* amdMem) const;
@@ -927,10 +936,7 @@ private:
    HSAILKernel& operator=(const HSAILKernel&);

    //! Creates AQL kernel HW info
-    bool aqlCreateHWInfo(
-        const void* kernel,     //!< Kernel's packed binary info and code
-        size_t kernelSize       //!< Size of the kernel's packed binary
-        );
+    bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym);

    //! Initializes arguments_ and the abstraction layer kernel parameters
    void initArgList(
@@ -18,135 +18,6 @@
 #include "hsa.h"
 #include "hsa_ext_image.h"

-extern "C" bool
-ACL_API_ENTRY _aclHsaLoader(
-    aclCompiler* compiler_handle,
-    aclBinary* bin,
-    void* userData,
-    void (*allocateGPUMemory)(void* userData, size_t size, uint64_t* GPUMemory),
-    bool (*DmaMemoryCopy)(void* userData, uint64_t offset, const void* pSrc, size_t size),
-    void (*getSamplerObjectParam)(uint32_t* size, uint32_t* alignment),
-    void (*initializeSamplerObject)(void* userData, uint64_t offset, bool unnormalize,
-    uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW));
-
-bool
-DmaMemoryCopy(void* userData, uint64_t offset, const void* pSrc, size_t size)
-{
-    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
-    gpu::Memory* mem = const_cast<gpu::Memory*>(prog->globalStore());
-    if (mem == NULL) {
-        return false;
-    }
-    size_t maxCopySize = prog->globalVariableTotalSize();
-    if (maxCopySize >= size) {
-        maxCopySize = size;
-    }
-    amd::Coord3D origin(offset);
-    amd::Coord3D region(maxCopySize);
-    // memcpy mode
-    if (pSrc) {
-        const bool Entire  = true;
-        return prog->dev().xferMgr().writeBuffer(pSrc, *mem, origin, region, Entire);
-    }
-    // memset mode
-    else {
-        char pattern = 0;
-        return prog->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern),
-        origin, region);
-    }
-}
-
-void
-AllocateGPUMemory(void* userData, size_t size, uint64_t* GPUMemory)
-{
-    gpu::Memory* mem = NULL;
-    void*   cpuPtr = NULL;
-    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
-
-    mem = new gpu::Memory(prog->dev(), amd::alignUp(size, gpu::ConstBuffer::VectorSize));
-
-    // Initialize constant buffer
-    if ((mem == NULL) || !mem->create(gpu::Resource::Local)) {
-        delete mem;
-        *GPUMemory = 0;
-        return;
-    }
-    *GPUMemory = mem->vmAddress();
-    prog->setGlobalStore(mem);
-    prog->setGlobalVariableTotalSize(size);
-}
-
-void
-GetSamplerObjectParams(uint32_t* size, uint32_t* alignment)
-{
-    if (GPU_DIRECT_SRD) {
-        *size = gpu::HsaSamplerObjectSize;
-        *alignment = gpu::HsaSamplerObjectAlignment;
-    }
-    else {
-        *size = sizeof(uint64_t);
-        *alignment = sizeof(uint64_t);
-    }
-}
-
-void
-InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize,
-    uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)
-{
-    assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode");
-    hsa_ext_sampler_filter_mode_t filter =
-        static_cast<hsa_ext_sampler_filter_mode_t>(fltr);
-    hsa_ext_sampler_addressing_mode_t boundaryU =
-        static_cast<hsa_ext_sampler_addressing_mode_t>(addrU);
-
-    uint32_t    state = (unnormalize) ?
-        amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue;
-    if (filter == HSA_EXT_SAMPLER_FILTER_MODE_LINEAR) {
-        state |= amd::Sampler::StateFilterNearest;
-    }
-    else if (filter == HSA_EXT_SAMPLER_FILTER_MODE_LINEAR) {
-        state |= amd::Sampler::StateFilterLinear;
-    }
-    switch (boundaryU) {
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
-        state |= amd::Sampler::StateAddressClampToEdge;
-        break;
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER:
-        state |= amd::Sampler::StateAddressClamp;
-        break;
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:
-        state |= amd::Sampler::StateAddressRepeat;
-        break;
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
-        state |= amd::Sampler::StateAddressMirroredRepeat;
-        break;
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:
-    default:
-        break;
-    }
-
-    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
-    if (prog->dev().settings().hsailDirectSRD_) {
-        char *pCPUbuf = new char[gpu::HsaSamplerObjectSize];
-        if (!pCPUbuf) {
-          assert(false);
-          return;
-        }
-        prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize);
-        DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize);
-        delete pCPUbuf;
-    }
-    else {
-        gpu::Sampler* sampler = new gpu::Sampler(prog->dev());
-        if ((sampler != NULL) && sampler->create(state)) {
-            uint64_t    hwSrd = sampler->hwSrd();
-            DmaMemoryCopy(userData, offset, &hwSrd, sizeof(uint64_t));
-            prog->addSampler(sampler);
-        }
-    }
-    return;
-}
-
 namespace gpu {

 bool
@@ -1768,10 +1639,11 @@ HSAILProgram::HSAILProgram(Device& device)
    , llvmBinary_()
    , binaryElf_(NULL)
    , rawBinary_(NULL)
-    , globalStore_(NULL)
    , kernels_(NULL)
    , maxScratchRegs_(0)
    , isNull_(false)
+    , executable_(NULL)
+    , loaderContext_(this)
 {
    memset(&binOpts_, 0, sizeof(binOpts_));
    binOpts_.struct_size = sizeof(binOpts_);
@@ -1786,10 +1658,11 @@ HSAILProgram::HSAILProgram(NullDevice& device)
    , llvmBinary_()
    , binaryElf_(NULL)
    , rawBinary_(NULL)
-    , globalStore_(NULL)
    , kernels_(NULL)
    , maxScratchRegs_(0)
    , isNull_(true)
+    , executable_(NULL)
+    , loaderContext_(this)
 {
    memset(&binOpts_, 0, sizeof(binOpts_));
    binOpts_.struct_size = sizeof(binOpts_);
@@ -1817,7 +1690,9 @@ HSAILProgram::~HSAILProgram()
        }
    }
    releaseClBinary();
-    delete globalStore_;
+    if (executable_ != NULL) {
+        Executable::Destroy(executable_);
+    }
    delete kernels_;
 }

@@ -2163,21 +2038,46 @@ HSAILProgram::linkImpl(amd::option::Options* options)
        break;
    }
    case ACL_TYPE_CG:
-        hsaLoad = false;
        break;
    case ACL_TYPE_ISA:
-        hsaLoad = false;
        finalize = false;
        break;
    default:
        buildLog_ += "Error while BRIG Codegen phase: the binary is incomplete \n" ;
        return false;
    }
+    if (finalize) {
+        std::string fin_options(options->origOptionStr + hsailOptions());
+        // Append an option so that we can selectively enable a SCOption on CZ
+        // whenever IOMMUv2 is enabled.
+        if (dev().settings().svmFineGrainSystem_) {
+            fin_options.append(" -sc-xnack-iommu");
+        }
+        errorCode = aclCompile(dev().hsaCompiler(), binaryElf_,
+            fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL);
+        buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
+        if (errorCode != ACL_SUCCESS) {
+            LogError("Failed to finalize");
+            return false;
+        }
+    }
    // ACL_TYPE_CG stage is not performed for offline compilation
+    hsa_agent_t agent;
+    agent.handle = 1;
    if (!isNull() && hsaLoad) {
-        if (!_aclHsaLoader(dev().hsaCompiler(), binaryElf_, this, &AllocateGPUMemory,
-            &DmaMemoryCopy, &GetSamplerObjectParams, &InitializeSamplerObject)) {
-            buildLog_ += "Error while BRIG Codegen phase: loading BRIG globals in the ELF \n";
+        executable_ = Executable::Create(HSA_PROFILE_BASE, &loaderContext_, NULL);
+        if (executable_ == NULL) {
+            return false;
+        }
+        size_t size = 0;
+        hsa_code_object_t code_object;
+        code_object.handle = reinterpret_cast<uint64_t>(aclExtractSection(dev().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode));
+        if (errorCode != ACL_SUCCESS) {
+            return false;
+        }
+        hsa_status_t status = executable_->LoadCodeObject(agent, code_object, NULL);
+        if (status != HSA_STATUS_SUCCESS) {
+            buildLog_ += "Error while HSA Loader phase: loading HSA Code Object \n";
            return false;
        }
    }
@@ -2187,7 +2087,7 @@ HSAILProgram::linkImpl(amd::option::Options* options)
        buildLog_ += "Error while Finalization phase: kernel names query from the ELF failed\n";
        return false;
    }
-    if (kernelNamesSize > 0) {
+    if (!isNull() && kernelNamesSize > 0) {
        char* kernelNames = new char[kernelNamesSize];
        errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize);
        if (errorCode != ACL_SUCCESS) {
@@ -2202,12 +2102,18 @@ HSAILProgram::linkImpl(amd::option::Options* options)
        for (it; it != vKernels.end(); ++it) {
            std::string kernelName = *it;
            HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions());
-            if (!aKernel->init(finalize)) {
+            kernels()[kernelName] = aKernel;
+            amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", Kernel::openclMangledName(kernelName).c_str(), agent, 0);
+            if (!sym) {
+                LogError("Failed to get kernel ISA code");
+                return false;
+            }
+            if (!aKernel->init(sym, false)) {
+                LogError("Failed to init HSAILKernel");
                return false;
            }
            buildLog_ += aKernel->buildLog();
            aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
-            kernels()[kernelName] = aKernel;
            dynamicParallelism |= aKernel->dynamicParallelism();
            // Find max scratch regs used in the program. It's used for scratch buffer preallocation
            // with dynamic parallelism, since runtime doesn't know which child kernel will be called
@@ -2333,5 +2239,208 @@ HSAILProgram::info(const char * str) {
    return info_;
 }

+hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
+    hsa_isa_t isa = {0};
+    if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; }
+    if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; }
+    if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; }
+    if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; }
+    if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; }
+    if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; }
+    return isa;
+}
+
+bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
+    switch (program_->dev().hwInfo()->gfxipVersion_) {
+    default:
+        LogError("Unsupported gfxip version");
+        return false;
+    case gfx700:
+    case gfx701:
+    case gfx702:
+        // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device.
+        return isa.handle == gfx700 || isa.handle == gfx701;
+    case gfx800:
+        if (ED_ATI_CAL_MACHINE_ICELAND_ISA == program_->dev().hwInfo()->machine_ ||
+            ED_ATI_CAL_MACHINE_TONGA_ISA == program_->dev().hwInfo()->machine_ ) {
+            return isa.handle == gfx800;
+        } else {
+            // gfx800 code is only more restricted in SGPRs, so it can also be loaded on later chips.
+            return isa.handle == gfx800 || isa.handle == gfx801;
+        }
+    case gfx900:
+        return isa.handle == gfx900;
+    }
+}
+
+void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+        return AgentGlobalAlloc(agent, size, align, zero);
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+        return KernelCodeAlloc(agent, size, align, zero);
+    default:
+        assert(false); return 0;
+    }
+}
+
+bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) {
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+      return AgentGlobalCopy(dst, offset, src, size);
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+      return KernelCodeCopy(dst, offset, src, size);
+    default:
+      assert(false); return false;
+    }
+}
+
+void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* seg, size_t size) {
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT: AgentGlobalFree(seg, size); break;
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT: KernelCodeFree(seg, size); break;
+    default:
+        assert(false); return;
+    }
+}
+
+void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* seg, size_t offset) {
+    assert(seg);
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT: {
+        gpu::Memory *gpuMem = reinterpret_cast<gpu::Memory*>(seg);
+        return reinterpret_cast<void*>(gpuMem->vmAddress());
+    }
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset;
+    default:
+        assert(false); return NULL;
+    }
+}
+
+hsa_status_t ORCAHSALoaderContext::SamplerCreate(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler_handle) {
+    if (!agent.handle) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+    if (!sampler_descriptor || !sampler_handle) {
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    uint32_t state = 0;
+    switch (sampler_descriptor->coordinate_mode) {
+        case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break;
+        case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED:   state = amd::Sampler::StateNormalizedCoordsTrue; break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    switch (sampler_descriptor->filter_mode) {
+        case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break;
+        case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR:  state |= amd::Sampler::StateFilterLinear; break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+    }
+    switch (sampler_descriptor->address_mode) {
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:   state |= amd::Sampler::StateAddressClampToEdge; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:          state |= amd::Sampler::StateAddressRepeat; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    assert(!program_->dev().settings().hsailDirectSRD_);
+    gpu::Sampler* sampler = new gpu::Sampler(program_->dev());
+    if (!sampler || !sampler->create(state)) {
+        delete sampler;
+        return HSA_STATUS_ERROR;
+    }
+    program_->addSampler(sampler);
+    sampler_handle->handle = sampler->hwSrd();
+    return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
+    hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) {
+    if (!agent.handle) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+    if (!sampler_handle.handle) {
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    return HSA_STATUS_SUCCESS;
+}
+
+void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    void* ptr = amd::Os::alignedMalloc(size, align);
+    if (zero && ptr != NULL) {  // skip the zero-fill when allocation failed (assumes alignedMalloc reports failure as NULL)
+        memset(ptr, 0, size);
+    }
+    return ptr;  // may be NULL; the caller decides how to handle a failed allocation
+}
+
+bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) {
+  if (!dst || !src || dst == src) {
+      return false;
+  }
+  if (0 == size) {
+      return true;
+  }
+  amd::Os::fastMemcpy((char*)dst + offset, src, size);
+  return true;
+}
+
+void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    gpu::Memory* mem = new gpu::Memory(program_->dev(), amd::alignUp(size, align));
+    if (!mem || !mem->create(gpu::Resource::Local)) {
+        delete mem;
+        return NULL;
+    }
+    assert(program_->dev().xferQueue());
+    if (zero) {
+        char pattern = 0;
+        program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size));
+    }
+    program_->addGlobalStore(mem);
+    program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size);
+    return mem;
+}
+
+bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
+    if (!dst || !src || dst == src) {
+        return false;
+    }
+    if (0 == size) {
+        return true;
+    }
+    assert(program_->dev().xferQueue());
+    gpu::Memory* mem = reinterpret_cast<gpu::Memory*>(dst);
+    // The transfer status is the result of the copy; the former trailing 'return true;' was unreachable.
+    return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
+}

 } // namespace gpu
@@ -7,11 +7,18 @@

 #include "device/gpu/gpukernel.hpp"
 #include "device/gpu/gpubinary.hpp"
+#include "amd_hsa_loader.hpp"

 namespace amd {
 namespace option {
 class Options;
 } // option
+namespace hsa {
+namespace loader {
+class Executable;
+class Context;
+} // loader
+} // hsa
 } // amd

 //! \namespace gpu GPU Device Implementation
@@ -369,6 +376,121 @@ private:
    gpu::Memory*    glbData_;   //!< Global data store
 };

+using namespace amd::hsa::loader;
+class HSAILProgram;
+
+class ORCAHSALoaderContext final: public Context {
+public:
+    ORCAHSALoaderContext(HSAILProgram* program): program_(program) {}
+
+    virtual ~ORCAHSALoaderContext() {}
+
+    hsa_isa_t IsaFromName(const char *name) override;
+
+    bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override;
+
+    void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, size_t size, size_t align, bool zero) override;
+
+    bool SegmentCopy(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* dst, size_t offset,
+        const void* src, size_t size) override;
+
+    void SegmentFree(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t size = 0) override;
+
+    void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t offset) override;
+
+    bool ImageExtensionSupported() override { return false; }
+
+    hsa_status_t ImageCreate(
+        hsa_agent_t agent,
+        hsa_access_permission_t image_permission,
+        const hsa_ext_image_descriptor_t *image_descriptor,
+        const void *image_data,
+        hsa_ext_image_t *image_handle) override {
+        // not supported
+        assert(false);
+        return HSA_STATUS_ERROR;
+    }
+
+    hsa_status_t ImageDestroy(
+        hsa_agent_t agent, hsa_ext_image_t image_handle) override {
+        // not supported
+        assert(false);
+        return HSA_STATUS_ERROR;
+    }
+
+    hsa_status_t SamplerCreate(
+        hsa_agent_t agent,
+        const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+        hsa_ext_sampler_t *sampler_handle) override;
+
+    //! All samplers are owned by HSAILProgram and are deleted in its destructor.
+    hsa_status_t SamplerDestroy(
+        hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override;
+
+private:
+
+    void* AgentGlobalAlloc(
+        hsa_agent_t agent, size_t size, size_t align, bool zero) {
+        return GpuMemAlloc(size, align, zero);
+    }
+
+    bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) {
+        return GpuMemCopy(dst, offset, src, size);
+    }
+
+    void AgentGlobalFree(void *ptr, size_t size) {
+        GpuMemFree(ptr, size);
+    }
+
+    void* KernelCodeAlloc(
+        hsa_agent_t agent, size_t size, size_t align, bool zero) {
+        return CpuMemAlloc(size, align, zero);
+    }
+
+    bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) {
+        return CpuMemCopy(dst, offset, src, size);
+    }
+
+    void KernelCodeFree(void *ptr, size_t size) {
+        CpuMemFree(ptr, size);
+    }
+
+    void* CpuMemAlloc(size_t size, size_t align, bool zero);
+
+    bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size);
+
+    void CpuMemFree(void *ptr, size_t size) {
+        amd::Os::alignedFree(ptr);
+    }
+
+    void* GpuMemAlloc(size_t size, size_t align, bool zero);
+
+    bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size);
+
+    void GpuMemFree(void *ptr, size_t size = 0) {
+        delete reinterpret_cast<gpu::Memory*>(ptr);
+    }
+
+    ORCAHSALoaderContext(const ORCAHSALoaderContext &c);
+
+    ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c);
+
+    enum gfx_handle {
+        gfx700 = 700,
+        gfx701 = 701,
+        gfx702 = 702,
+        gfx800 = 800,
+        gfx801 = 801,
+        gfx810 = 810,
+        gfx900 = 900
+    };
+
+    gpu::HSAILProgram* program_;
+};

 //! \class HSAIL program
 class HSAILProgram : public device::Program
@@ -385,9 +507,9 @@ public:
    aclBinary* binaryElf() const {
        return static_cast<aclBinary*>(binaryElf_); }

-    void setGlobalStore(Memory* mem) { globalStore_ = mem; }
+    void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); } // appends; previously added entries are kept

-    const Memory* globalStore() const { return globalStore_; }
+    const std::vector<Memory*>& globalStores() const { return globalStores_; } // read-only reference to the internal vector (no copy)

    //! Return a typecasted GPU device
    gpu::Device& dev()
@@ -497,11 +619,13 @@ private:
    aclBinary*      binaryElf_;     //!< Binary for the new compiler library
    void*           rawBinary_;     //!< Pointer to the raw binary
    aclBinaryOptions binOpts_;      //!< Binary options to create aclBinary
-    Memory*         globalStore_;   //!< Global memory for the program
+    std::vector<Memory*>         globalStores_;   //!< Global memory objects for the program (appended by addGlobalStore)
    Memory*         kernels_;       //!< Table with kernel object pointers
    uint    maxScratchRegs_;    //!< Maximum number of scratch regs used in the program by individual kernel
    std::list<Sampler*>   staticSamplers_;    //!< List od internal static samplers
    bool            isNull_;        //!< Null program no memory allocations
+    amd::hsa::loader::Executable* executable_;    //!< Executable for HSA Loader
+    ORCAHSALoaderContext loaderContext_;    //!< Context for HSA Loader
 };

 /*@}*/} // namespace gpu
@@ -14,6 +14,7 @@
 #include <sstream>
 #include <iostream>
 #include <ctime>
+#include "amd_hsa_loader.hpp"

 namespace gpu {

@@ -137,54 +138,36 @@ NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding)
 }

 bool
-HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize)
+HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) // now takes the kernel's loader symbol instead of a raw shader blob
 {
-    // Copy the shader_isa into a buffer
-    hwMetaData_ = new char[shaderSize];
-    if (hwMetaData_ == NULL) {
+    if (!sym) { // no symbol: nothing to query, report failure
        return false;
    }
-    memcpy(hwMetaData_, shader, shaderSize);
-
-    SC_SI_HWSHADER_CS* siMetaData = reinterpret_cast<SC_SI_HWSHADER_CS*>(hwMetaData_);
-
-    // Code to patch the pointers in the shader object.
-    // Must be preferably done in the compiler library
-    size_t offset = siMetaData->common.uSizeInBytes;
-    if (siMetaData->common.u32PvtDataSizeInBytes > 0) {
-        siMetaData->common.pPvtData =
-            reinterpret_cast<SC_BYTE *>(
-            reinterpret_cast<char *>(siMetaData) + offset);
-        offset += siMetaData->common.u32PvtDataSizeInBytes;
+    uint64_t akc_addr = 0; // filled by the KERNEL_OBJECT query below, then used as a pointer the CPU reads from
+    if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&akc_addr))) { // a false result is treated as failure
+        return false;
    }
-    if (siMetaData->common.codeLenInByte > 0) {
-        siMetaData->common.hShaderMemHandle =
-            reinterpret_cast<char *>(siMetaData) + offset;
-        offset += siMetaData->common.codeLenInByte;
+    amd_kernel_code_t *akc = reinterpret_cast<amd_kernel_code_t*>(akc_addr); // the queried address is interpreted as an amd_kernel_code_t
+    cpuAqlCode_ = akc; // NOTE(review): stored before the remaining checks, so it stays set even if a later step returns false
+    if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast<void*>(&codeSize_))) { // size is written straight into codeSize_; NOTE(review): assumes codeSize_ has the width the loader writes - confirm
+        return false;
    }
-
-    char* headerBaseAddress =
-        reinterpret_cast<char*>(siMetaData->common.hShaderMemHandle);
-    amd_kernel_code_t* akc = reinterpret_cast<amd_kernel_code_t*>(
-        headerBaseAddress);
-
-    address codeStartAddress = reinterpret_cast<address>(akc);
-    address codeEndAddress = reinterpret_cast<address>(akc) + siMetaData->common.codeLenInByte;
-    codeSize_ = codeEndAddress - codeStartAddress;
-    code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize));
-
+    size_t akc_align = 0; // set by the KERNEL_OBJECT_ALIGN query below
+    if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast<void*>(&akc_align))) { // a false result is treated as failure
+        return false;
+    }
+    code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align)); // size is codeSize_ rounded up to akc_align; NOTE(review): akc_align is not checked for 0 - confirm the loader always reports a valid alignment
    // Initialize kernel ISA code
-    if ((code_ != NULL) && code_->create(Resource::Shader)) {
+    if (code_ && code_->create(Resource::Shader)) { // both the allocation and create(Resource::Shader) must succeed
        address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
        // Copy only amd_kernel_code_t
-        memcpy(cpuCodePtr, codeStartAddress, codeSize_);
+        memcpy(cpuCodePtr,  reinterpret_cast<address>(akc), codeSize_); // copies codeSize_ bytes starting at akc into the mapped buffer
        code_->unmap(NULL);
    }
    else {
        LogError("Failed to allocate ISA code!");
        return false;
    }
-    cpuAqlCode_ = akc;

    assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
        "Scratch must be DWORD aligned");
@@ -3293,10 +3293,10 @@ VirtualGPU::processMemObjectsHSA(
        }
    }

-    if (hsaKernel.prog().globalStore() != NULL) {
+    for (gpu::Memory* mem : hsaKernel.prog().globalStores()) { // one validate() call per global store; an empty vector means no calls
        const static bool IsReadOnly = false;
        // Validate global store for a dependency in the queue
-        memoryDependency().validate(*this, hsaKernel.prog().globalStore(), IsReadOnly);
+        memoryDependency().validate(*this, mem, IsReadOnly);
    }
 }