diff --git a/projects/clr/rocclr/compiler/lib/amdoclcl.def.in b/projects/clr/rocclr/compiler/lib/amdoclcl.def.in
index e63d96bfe5..a4f923e8c8 100644
--- a/projects/clr/rocclr/compiler/lib/amdoclcl.def.in
+++ b/projects/clr/rocclr/compiler/lib/amdoclcl.def.in
@@ -50,6 +50,3 @@ aclJITObjectImageDisassembleKernel
 #endif
 aclJITObjectImageIterateSymbols
 aclJITObjectImageGetGlobalsSize
-#if defined(WITH_TARGET_HSAIL)
-_aclHsaLoader
-#endif
diff --git a/projects/clr/rocclr/compiler/lib/amdoclcl.map.in b/projects/clr/rocclr/compiler/lib/amdoclcl.map.in
index 981529d8c9..a56ae07c08 100644
--- a/projects/clr/rocclr/compiler/lib/amdoclcl.map.in
+++ b/projects/clr/rocclr/compiler/lib/amdoclcl.map.in
@@ -53,9 +53,6 @@ global:
 #endif
   aclJITObjectImageIterateSymbols;
   aclJITObjectImageGetGlobalsSize;
-#if defined(WITH_TARGET_HSAIL)
-  _aclHsaLoader;
-#endif
 #if defined(OPENCL_MAINLINE)
 local:
   *;
diff --git a/projects/clr/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp b/projects/clr/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp
index b910ad34c6..100b4397f3 100644
--- a/projects/clr/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp
+++ b/projects/clr/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp
@@ -2092,11 +2092,17 @@ if_aclGetDeviceBinary(aclCompiler *cl,
     size_t *size,
     acl_error *error_code)
 {
-  const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symISABinary);
-  assert(symbol && "symbol not found");
-  std::string kernelName = symbol->str[PRE] + std::string(kernel) + symbol->str[POST];
-  return cl->clAPI.extSym(cl, bin, size,
-      symbol->sections[0], kernelName.c_str(), error_code);
+#ifdef WITH_TARGET_HSAIL
+  if (isHSAILTarget(bin->target)) {
+    return cl->clAPI.extSec(cl, bin, size, aclTEXT, error_code);
+  } else
+#endif
+  {
+    const oclBIFSymbolStruct* sym = findBIF30SymStruct(symISABinary);
+    assert(sym && "symbol not found");
+    std::string name = sym->str[PRE] + std::string(kernel) + sym->str[POST];
+    return cl->clAPI.extSym(cl, bin, size, sym->sections[0], name.c_str(), error_code);
+  }
 }
 
 acl_error  ACL_API_ENTRY
diff --git a/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.cpp b/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.cpp
index 963b5ab158..0342406dbd 100644
--- a/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.cpp
+++ b/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.cpp
@@ -17,6 +17,12 @@ extern aclBinary* constructBinary(size_t struct_version,
     const aclTargetInfo *target,
     const aclBinaryOptions *opts);
 
+static const std::string sgfx700 = "AMD:AMDGPU:7:0:0";  // ISA type names handed out by getIsaTypeName()
+static const std::string sgfx701 = "AMD:AMDGPU:7:0:1";
+static const std::string sgfx800 = "AMD:AMDGPU:8:0:0";
+static const std::string sgfx801 = "AMD:AMDGPU:8:0:1";
+static const std::string sgfx900 = "AMD:AMDGPU:9:0:0";
+
 // Utility function to set a flag in option structure
 // of the aclDevCaps.
 void
@@ -501,6 +507,54 @@ unsigned getChipEnum(const aclTargetInfo *target)
   return Mapping.chip_enum;
 }
 
+/*! Returns the ISA type name (compute capability) for the given target, chosen
+ * by the family_enum and chip_enum of its TargetMapping entry. Chips not listed
+ * for a family get that family's default; unlisted families get sgfx700. */
+const std::string &getIsaTypeName(const aclTargetInfo *target)
+{
+  const TargetMapping& Mapping = getTargetMapping(*target);  // target is dereferenced unchecked: must be non-null
+  switch (Mapping.family_enum) {
+    default: return sgfx700;
+    case FAMILY_KV:
+      switch (Mapping.chip_enum) {
+        default: return sgfx700;
+        case KV_SPECTRE_A0:
+        case KV_SPOOKY_A0:
+        case KB_KALINDI_A0:
+        // ??? NOTE(review): unexplained marker kept from the author; presumably the Godavari ISA is unconfirmed - verify.
+        case ML_GODAVARI_A0: return sgfx700;
+      }  // every path of the inner switch returns, so control never falls into the next family case
+    case FAMILY_CI:
+      switch (Mapping.chip_enum) {
+        default: return sgfx700;
+        case CI_BONAIRE_M_A0:
+        case CI_BONAIRE_M_A1: return sgfx700;
+        case CI_HAWAII_P_A0: return sgfx701;
+        case CI_TIRAN_P_A0:
+        case CI_MAUI_P_A0: return sgfx700;
+      }
+    case FAMILY_VI:
+      switch (Mapping.chip_enum) {
+        default: return sgfx800;
+        case VI_ICELAND_M_A0:
+        case VI_TONGA_P_A0: return sgfx800;
+        case VI_ELLESMERE_P_A0:
+        case VI_BAFFIN_M_A0:
+        case VI_FIJI_P_A0: return sgfx801;
+      }
+    case FAMILY_CZ:
+      switch (Mapping.chip_enum) {
+        default: return sgfx801;
+        case CARRIZO_A0: return sgfx801;
+      }
+    case FAMILY_AI:
+      switch (Mapping.chip_enum) {
+        default: return sgfx900;
+        case AI_GREENLAND_P_A0: return sgfx900;
+      }
+  }
+}
+
 void
 appendLogToCL(aclCompiler *cl, const std::string &logStr)
 {
diff --git a/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.h b/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.h
index d90ee94a2e..d3a068573d 100644
--- a/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.h
+++ b/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.h
@@ -36,16 +36,19 @@ initElfDeviceCaps(aclBinary *elf);
 void
 appendLogToCL(aclCompiler *cl, const std::string &logStr);
 
-const char *getDeviceName(const aclTargetInfo &Target);
+const char *getDeviceName(const aclTargetInfo &target);
 
 // Select the correct library from the target information.
-amd::LibrarySelector getLibraryType(const aclTargetInfo *Target);
+amd::LibrarySelector getLibraryType(const aclTargetInfo *target);
 
 // get family_enum from the target information.
-unsigned getFamilyEnum(const aclTargetInfo *Target);
+unsigned getFamilyEnum(const aclTargetInfo *target);
 
 // get chip_enum from the target information.
-unsigned getChipEnum(const aclTargetInfo *Target);
+unsigned getChipEnum(const aclTargetInfo *target);
+
+// get isa type name (compute capability) from the target information.
+const std::string &getIsaTypeName(const aclTargetInfo *target);
 
 // Create a copy of an ELF and duplicate all sections/symbols
 aclBinary*
diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index 6f229b7ddf..6bff20f27a 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -940,6 +940,8 @@ public:
     //! Return the build log
     const std::string& buildLog() const { return buildLog_; }
 
+    static std::string openclMangledName(const std::string& name) { return "&__OpenCL_" + name + "_kernel"; }  //!< Builds the symbol name "&__OpenCL_<name>_kernel" for a kernel name
+
 protected:
     std::string     name_;              //!< kernel name
     WorkGroupInfo   workGroupInfo_;     //!< device kernel info structure
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudefs.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudefs.hpp
index 0fb0e9b133..d9e8e18b6c 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudefs.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudefs.hpp
@@ -128,6 +128,13 @@ static const AMDDeviceInfo DeviceInfo[] = {
 /* CAL_TARGET_GREENLAND */  { ED_ATI_CAL_MACHINE_GREENLAND_ISA, "",  "",   4, 16, 1, 256, 64 * Ki, 32, 900 },
 };
 
+static const char* const Gfx700 = "AMD:AMDGPU:7:0:0";  // ISA name strings, compared by ORCAHSALoaderContext::IsaFromName
+static const char* const Gfx701 = "AMD:AMDGPU:7:0:1";  // (pointers are const: header-scope statics must not be reassignable per TU)
+static const char* const Gfx800 = "AMD:AMDGPU:8:0:0";
+static const char* const Gfx801 = "AMD:AMDGPU:8:0:1";
+static const char* const Gfx810 = "AMD:AMDGPU:8:1:0";
+static const char* const Gfx900 = "AMD:AMDGPU:9:0:0";
+
 // Supported OpenCL versions
 enum OclVersion {
     OpenCL10,
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index e2d61d0982..a5f3e8af4f 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3552,12 +3552,12 @@ HSAILKernel::~HSAILKernel()
 }
 
 bool
-HSAILKernel::init(bool finalize)
+HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
 {
     acl_error error;
-    const oclBIFSymbolStruct* sym = findBIF30SymStruct(symOpenclKernel);
-    assert(sym && "symbol not found");
-    std::string openClKernelName(std::string("&") + sym->str[PRE] + name() + sym->str[POST]);
+    const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
+    assert(bifSym && "symbol not found");
+    std::string openClKernelName(std::string("&") + bifSym->str[PRE] + name() + bifSym->str[POST]);
     //compile kernel down to ISA
     if (finalize) {
         std::string options(compileOptions_.c_str());
@@ -3578,19 +3578,10 @@ HSAILKernel::init(bool finalize)
             return false;
         }
     }
-    // Get the ISA out
-    size_t  size_isa;
-    void*   shader_isa = NULL;
-    shader_isa = const_cast<void *>(aclGetDeviceBinary(dev().hsaCompiler(),
-        prog().binaryElf(), openClKernelName.c_str(), &size_isa, &error));
-    if (shader_isa == NULL) {
-        LogError("Failed find the ISA");
-        return false;
-    }
 
     // Allocate HW resources for the real program only
     if (!prog().isNull()) {
-        aqlCreateHWInfo(shader_isa, size_isa);
+        aqlCreateHWInfo(sym);
     }
 
     // Pull out metadata from the ELF
@@ -4131,8 +4122,8 @@ HSAILKernel::loadArguments(
 
     memList.push_back(cb);
     memList.push_back(gpuAqlCode());
-    if (NULL != prog().globalStore()) {
-        memList.push_back(prog().globalStore());
+    for (gpu::Memory * mem : prog().globalStores()) {
+        memList.push_back(mem);
     }
     if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
           AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
index 7147128ff9..870f0313de 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -17,6 +17,15 @@
 #include "device/gpu/gpuprintf.hpp"
 #include "device/gpu/gpuwavelimiter.hpp"
 #include "hsa.h"
+
+namespace amd {
+namespace hsa {
+namespace loader {
+class Symbol;  // forward declaration only; this header uses it as a pointer type in HSAILKernel
+} // loader
+} // hsa
+} // amd
+
 //! \namespace gpu GPU Device Implementation
 namespace gpu {
 
@@ -847,7 +856,7 @@ public:
 
     //! Initializes the metadata required for this kernel,
     //! finalizes the kernel if needed
-    bool init(bool finalize = true);
+    bool init(amd::hsa::loader::Symbol *sym, bool finalize = false);
 
     //! Returns true if memory is valid for execution
     virtual bool validateMemory(uint idx, amd::Memory* amdMem) const;
@@ -927,10 +936,7 @@ private:
     HSAILKernel& operator=(const HSAILKernel&);
 
     //! Creates AQL kernel HW info
-    bool aqlCreateHWInfo(
-        const void* kernel,     //!< Kernel's packed binary info and code
-        size_t kernelSize       //!< Size of the kernel's packed binary
-        );
+    bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym);
 
     //! Initializes arguments_ and the abstraction layer kernel parameters
     void initArgList(
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuprogram.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuprogram.cpp
index cd38fbae9c..5780243b8b 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuprogram.cpp
@@ -18,135 +18,6 @@
 #include "hsa.h"
 #include "hsa_ext_image.h"
 
-extern "C" bool
-ACL_API_ENTRY _aclHsaLoader(
-    aclCompiler* compiler_handle,
-    aclBinary* bin,
-    void* userData,
-    void (*allocateGPUMemory)(void* userData, size_t size, uint64_t* GPUMemory),
-    bool (*DmaMemoryCopy)(void* userData, uint64_t offset, const void* pSrc, size_t size),
-    void (*getSamplerObjectParam)(uint32_t* size, uint32_t* alignment),
-    void (*initializeSamplerObject)(void* userData, uint64_t offset, bool unnormalize,
-    uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW));
-
-bool
-DmaMemoryCopy(void* userData, uint64_t offset, const void* pSrc, size_t size)
-{
-    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
-    gpu::Memory* mem = const_cast<gpu::Memory*>(prog->globalStore());
-    if (mem == NULL) {
-        return false;
-    }
-    size_t maxCopySize = prog->globalVariableTotalSize();
-    if (maxCopySize >= size) {
-        maxCopySize = size;
-    }
-    amd::Coord3D origin(offset);
-    amd::Coord3D region(maxCopySize);
-    // memcpy mode
-    if (pSrc) {
-        const bool Entire  = true;
-        return prog->dev().xferMgr().writeBuffer(pSrc, *mem, origin, region, Entire);
-    }
-    // memset mode
-    else {
-        char pattern = 0;
-        return prog->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern),
-        origin, region);
-    }
-}
-
-void
-AllocateGPUMemory(void* userData, size_t size, uint64_t* GPUMemory)
-{
-    gpu::Memory* mem = NULL;
-    void*   cpuPtr = NULL;
-    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
-
-    mem = new gpu::Memory(prog->dev(), amd::alignUp(size, gpu::ConstBuffer::VectorSize));
-
-    // Initialize constant buffer
-    if ((mem == NULL) || !mem->create(gpu::Resource::Local)) {
-        delete mem;
-        *GPUMemory = 0;
-        return;
-    }
-    *GPUMemory = mem->vmAddress();
-    prog->setGlobalStore(mem);
-    prog->setGlobalVariableTotalSize(size);
-}
-
-void
-GetSamplerObjectParams(uint32_t* size, uint32_t* alignment)
-{
-    if (GPU_DIRECT_SRD) {
-        *size = gpu::HsaSamplerObjectSize;
-        *alignment = gpu::HsaSamplerObjectAlignment;
-    }
-    else {
-        *size = sizeof(uint64_t);
-        *alignment = sizeof(uint64_t);
-    }
-}
-
-void
-InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize,
-    uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)
-{
-    assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode");
-    hsa_ext_sampler_filter_mode_t filter =
-        static_cast<hsa_ext_sampler_filter_mode_t>(fltr);
-    hsa_ext_sampler_addressing_mode_t boundaryU =
-        static_cast<hsa_ext_sampler_addressing_mode_t>(addrU);
-
-    uint32_t    state = (unnormalize) ?
-        amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue;
-    if (filter == HSA_EXT_SAMPLER_FILTER_MODE_LINEAR) {
-        state |= amd::Sampler::StateFilterNearest;
-    }
-    else if (filter == HSA_EXT_SAMPLER_FILTER_MODE_LINEAR) {
-        state |= amd::Sampler::StateFilterLinear;
-    }
-    switch (boundaryU) {
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
-        state |= amd::Sampler::StateAddressClampToEdge;
-        break;
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER:
-        state |= amd::Sampler::StateAddressClamp;
-        break;
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:
-        state |= amd::Sampler::StateAddressRepeat;
-        break;
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
-        state |= amd::Sampler::StateAddressMirroredRepeat;
-        break;
-    case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:
-    default:
-        break;
-    }
-
-    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
-    if (prog->dev().settings().hsailDirectSRD_) {
-        char *pCPUbuf = new char[gpu::HsaSamplerObjectSize];
-        if (!pCPUbuf) {
-          assert(false);
-          return;
-        }
-        prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize);
-        DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize);
-        delete pCPUbuf;
-    }
-    else {
-        gpu::Sampler* sampler = new gpu::Sampler(prog->dev());
-        if ((sampler != NULL) && sampler->create(state)) {
-            uint64_t    hwSrd = sampler->hwSrd();
-            DmaMemoryCopy(userData, offset, &hwSrd, sizeof(uint64_t));
-            prog->addSampler(sampler);
-        }
-    }
-    return;
-}
-
 namespace gpu {
 
 bool
@@ -1768,10 +1639,11 @@ HSAILProgram::HSAILProgram(Device& device)
     , llvmBinary_()
     , binaryElf_(NULL)
     , rawBinary_(NULL)
-    , globalStore_(NULL)
     , kernels_(NULL)
     , maxScratchRegs_(0)
     , isNull_(false)
+    , executable_(NULL)
+    , loaderContext_(this)
 {
     memset(&binOpts_, 0, sizeof(binOpts_));
     binOpts_.struct_size = sizeof(binOpts_);
@@ -1786,10 +1658,11 @@ HSAILProgram::HSAILProgram(NullDevice& device)
     , llvmBinary_()
     , binaryElf_(NULL)
     , rawBinary_(NULL)
-    , globalStore_(NULL)
     , kernels_(NULL)
     , maxScratchRegs_(0)
     , isNull_(true)
+    , executable_(NULL)
+    , loaderContext_(this)
 {
     memset(&binOpts_, 0, sizeof(binOpts_));
     binOpts_.struct_size = sizeof(binOpts_);
@@ -1817,7 +1690,9 @@ HSAILProgram::~HSAILProgram()
         }
     }
     releaseClBinary();
-    delete globalStore_;
+    if (executable_ != NULL) {
+        Executable::Destroy(executable_);
+    }
     delete kernels_;
 }
 
@@ -2163,21 +2038,46 @@ HSAILProgram::linkImpl(amd::option::Options* options)
         break;
     }
     case ACL_TYPE_CG:
-        hsaLoad = false;
         break;
     case ACL_TYPE_ISA:
-        hsaLoad = false;
         finalize = false;
         break;
     default:
         buildLog_ += "Error while BRIG Codegen phase: the binary is incomplete \n" ;
         return false;
     }
+    if (finalize) {
+        std::string fin_options(options->origOptionStr + hsailOptions());
+        // Append an option so that we can selectively enable a SCOption on CZ
+        // whenever IOMMUv2 is enabled.
+        if (dev().settings().svmFineGrainSystem_) {
+            fin_options.append(" -sc-xnack-iommu");
+        }
+        errorCode = aclCompile(dev().hsaCompiler(), binaryElf_,
+            fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL);
+        buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
+        if (errorCode != ACL_SUCCESS) {
+            LogError("Failed to finalize");
+            return false;
+        }
+    }
     // ACL_TYPE_CG stage is not performed for offline compilation
+    hsa_agent_t agent;
+    agent.handle = 1;
     if (!isNull() && hsaLoad) {
-        if (!_aclHsaLoader(dev().hsaCompiler(), binaryElf_, this, &AllocateGPUMemory,
-            &DmaMemoryCopy, &GetSamplerObjectParams, &InitializeSamplerObject)) {
-            buildLog_ += "Error while BRIG Codegen phase: loading BRIG globals in the ELF \n";
+        executable_ = Executable::Create(HSA_PROFILE_BASE, &loaderContext_, NULL);
+        if (executable_ == NULL) {
+            return false;
+        }
+        size_t size = 0;
+        hsa_code_object_t code_object;
+        code_object.handle = reinterpret_cast<uint64_t>(aclExtractSection(dev().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode));
+        if (errorCode != ACL_SUCCESS) {
+            return false;
+        }
+        hsa_status_t status = executable_->LoadCodeObject(agent, code_object, NULL);
+        if (status != HSA_STATUS_SUCCESS) {
+            buildLog_ += "Error while HSA Loader phase: loading HSA Code Object \n";
             return false;
         }
     }
@@ -2187,7 +2087,7 @@ HSAILProgram::linkImpl(amd::option::Options* options)
         buildLog_ += "Error while Finalization phase: kernel names query from the ELF failed\n";
         return false;
     }
-    if (kernelNamesSize > 0) {
+    if (!isNull() && kernelNamesSize > 0) {
         char* kernelNames = new char[kernelNamesSize];
         errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize);
         if (errorCode != ACL_SUCCESS) {
@@ -2202,12 +2102,18 @@ HSAILProgram::linkImpl(amd::option::Options* options)
         for (it; it != vKernels.end(); ++it) {
             std::string kernelName = *it;
             HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions());
-            if (!aKernel->init(finalize)) {
+            kernels()[kernelName] = aKernel;
+            amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", Kernel::openclMangledName(kernelName).c_str(), agent, 0);
+            if (!sym) {
+                LogError("Failed to get kernel ISA code");
+                return false;
+            }
+            if (!aKernel->init(sym, false)) {
+                LogError("Failed to init HSAILKernel");
                 return false;
             }
             buildLog_ += aKernel->buildLog();
             aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
-            kernels()[kernelName] = aKernel;
             dynamicParallelism |= aKernel->dynamicParallelism();
             // Find max scratch regs used in the program. It's used for scratch buffer preallocation
             // with dynamic parallelism, since runtime doesn't know which child kernel will be called
@@ -2333,5 +2239,208 @@ HSAILProgram::info(const char * str) {
     return info_;
 }
 
+hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {  // maps an ISA name string to a gfx_handle value
+    hsa_isa_t isa = {0};  // handle 0 is what callers get for an unrecognized name
+    if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; }  // name is passed to strcmp unchecked: must be non-null
+    if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; }
+    if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; }
+    if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; }
+    if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; }
+    if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; }
+    return isa;
+}
+
+bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {  // 'agent' is unused: the check is against this program's device
+    switch (program_->dev().hwInfo()->gfxipVersion_) {
+    default:
+        LogError("Unsupported gfxip version");  // NOTE(review): device gfxip 801 and 810 have no case below and land here - confirm intended
+        return false;
+    case gfx700:
+    case gfx701:
+    case gfx702:
+        // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device.
+        return isa.handle == gfx700 || isa.handle == gfx701;
+    case gfx800:
+        if (ED_ATI_CAL_MACHINE_ICELAND_ISA == program_->dev().hwInfo()->machine_ ||
+            ED_ATI_CAL_MACHINE_TONGA_ISA == program_->dev().hwInfo()->machine_ ) {
+            return isa.handle == gfx800;
+        } else {
+            // gfx800 code only has its SGPRs limited (assumed meaning of the original "sgrps") and can be loaded on later chips.
+            return isa.handle == gfx800 || isa.handle == gfx801;
+        }
+    case gfx900:
+        return isa.handle == gfx900;
+    }
+}
+
+void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,  // allocates backing storage for one loader segment
+    hsa_agent_t agent, size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+        return AgentGlobalAlloc(agent, size, align, zero);  // forwards to GpuMemAlloc: the result is a gpu::Memory*, not an address
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+        return KernelCodeAlloc(agent, size, align, zero);  // forwards to CpuMemAlloc: the result is a host pointer
+    default:
+        assert(false); return 0;  // unknown segment kind: null result when asserts are compiled out
+    }
+}
+
+bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,  // copies 'size' bytes from src into a segment at 'offset'
+    hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) {
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+      return AgentGlobalCopy(dst, offset, src, size);  // forwards to GpuMemCopy, which treats dst as a gpu::Memory*
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+      return KernelCodeCopy(dst, offset, src, size);  // forwards to CpuMemCopy, a host-side copy
+    default:
+      assert(false); return false;  // unknown segment kind
+    }
+}
+
+void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,  // releases storage obtained from SegmentAlloc
+    hsa_agent_t agent, void* seg, size_t size) {
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT: AgentGlobalFree(seg, size); break;  // deletes the gpu::Memory object (GpuMemFree)
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT: KernelCodeFree(seg, size); break;  // frees the host allocation (CpuMemFree)
+    default:
+        assert(false); return;  // unknown segment kind: nothing is freed
+    }
+}
+
+void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,  // returns the address of byte 'offset' inside a segment
+    hsa_agent_t agent, void* seg, size_t offset) {
+    assert(seg);
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT: {
+        gpu::Memory *gpuMem = reinterpret_cast<gpu::Memory*>(seg);  // seg is the gpu::Memory* handed out by SegmentAlloc
+        return reinterpret_cast<void*>(gpuMem->vmAddress() + offset);  // device VM address; offset applied as in the code branch
+    }
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset;  // host pointer into the CPU-side code copy
+    default:
+        assert(false); return NULL;  // unknown segment kind
+    }
+}
+
+hsa_status_t ORCAHSALoaderContext::SamplerCreate(  // builds a gpu::Sampler from an HSA descriptor and returns its hwSrd() as the handle
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler_handle) {
+    if (!agent.handle) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+    if (!sampler_descriptor || !sampler_handle) {
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    uint32_t state = 0;  // accumulates amd::Sampler state bits from the three descriptor fields
+    switch (sampler_descriptor->coordinate_mode) {
+        case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break;
+        case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED:   state = amd::Sampler::StateNormalizedCoordsTrue; break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    switch (sampler_descriptor->filter_mode) {
+        case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break;
+        case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR:  state |= amd::Sampler::StateFilterLinear; break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+    }
+    switch (sampler_descriptor->address_mode) {
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:   state |= amd::Sampler::StateAddressClampToEdge; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:          state |= amd::Sampler::StateAddressRepeat; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:  // UNDEFINED is rejected exactly like an unknown value
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    assert(!program_->dev().settings().hsailDirectSRD_);  // debug-only check; with asserts off this path runs even when hsailDirectSRD_ is set
+    gpu::Sampler* sampler = new gpu::Sampler(program_->dev());
+    if (!sampler || !sampler->create(state)) {
+        delete sampler;
+        return HSA_STATUS_ERROR;
+    }
+    program_->addSampler(sampler);  // the sampler is handed to the program via addSampler
+    sampler_handle->handle = sampler->hwSrd();
+    return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t ORCAHSALoaderContext::SamplerDestroy(  // validates the handles only; no sampler is released here
+    hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) {
+    if (!agent.handle) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+    if (!sampler_handle.handle) {
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    return HSA_STATUS_SUCCESS;  // nothing to free: SamplerCreate passed the sampler object to the program via addSampler
+}
+
+void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {  // aligned host allocation, optionally zero-filled
+    assert(size);
+    assert(align);
+    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    void* ptr = amd::Os::alignedMalloc(size, align);
+    if (zero && ptr != NULL) {  // never memset through a failed allocation
+        memset(ptr, 0, size);
+    }
+    return ptr;  // whatever alignedMalloc returned; the caller must check it
+}
+
+bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) {  // host copy of 'size' bytes to dst + offset
+  if (!dst || !src || dst == src) {
+      return false;  // null or identical pointers are reported as failure
+  }
+  if (0 == size) {
+      return true;  // empty copy succeeds without touching memory
+  }
+  amd::Os::fastMemcpy((char*)dst + offset, src, size);
+  return true;
+}
+
+void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {  // creates a gpu::Memory and returns the object pointer, or NULL on failure
+    assert(size);
+    assert(align);
+    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    gpu::Memory* mem = new gpu::Memory(program_->dev(), amd::alignUp(size, align));  // size is rounded up to a multiple of align
+    if (!mem || !mem->create(gpu::Resource::Local)) {
+        delete mem;
+        return NULL;
+    }
+    assert(program_->dev().xferQueue());
+    char pattern = 0;
+    if (zero && !program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size))) {
+        delete mem; return NULL;  // zero-fill was requested but failed: do not hand out uninitialized memory
+    }
+    program_->addGlobalStore(mem);  // NOTE(review): GpuMemFree deletes the object but this list entry is not removed - confirm lifetime
+    program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size);  // running total over all allocations
+    return mem;
+}
+
+bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {  // writes 'size' host bytes into a gpu::Memory at 'offset'
+    if (!dst || !src || dst == src) {
+        return false;  // null or identical pointers are reported as failure
+    }
+    if (0 == size) {
+        return true;  // empty copy succeeds without issuing a transfer
+    }
+    assert(program_->dev().xferQueue());
+    gpu::Memory* mem = reinterpret_cast<gpu::Memory*>(dst);
+    // dst is the gpu::Memory object returned by GpuMemAlloc, not a raw address; the transfer result is returned as-is.
+    return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
+}
 
 } // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuprogram.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuprogram.hpp
index 8945c5383f..916c293b06 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuprogram.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuprogram.hpp
@@ -7,11 +7,18 @@
 
 #include "device/gpu/gpukernel.hpp"
 #include "device/gpu/gpubinary.hpp"
+#include "amd_hsa_loader.hpp"
 
 namespace amd {
 namespace option {
 class Options;
 } // option
+namespace hsa {
+namespace loader {
+class Executable;
+class Context;
+} // loader
+} // hsa
 } // amd
 
 //! \namespace gpu GPU Device Implementation
@@ -369,6 +376,121 @@ private:
     gpu::Memory*    glbData_;   //!< Global data store
 };
 
+using namespace amd::hsa::loader;  // NOTE(review): using-directive in a header (presumably inside namespace gpu) - exposes loader names to every includer
+class HSAILProgram;  // forward declaration; ORCAHSALoaderContext below stores a HSAILProgram*
+
+class ORCAHSALoaderContext final: public Context {  // loader Context callbacks implemented on top of one HSAILProgram's device
+public:
+    ORCAHSALoaderContext(HSAILProgram* program): program_(program) {}  // the program pointer is stored, not owned
+
+    virtual ~ORCAHSALoaderContext() {}
+
+    hsa_isa_t IsaFromName(const char *name) override;
+
+    bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override;
+
+    void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, size_t size, size_t align, bool zero) override;
+
+    bool SegmentCopy(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* dst, size_t offset,
+        const void* src, size_t size) override;
+
+    void SegmentFree(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t size = 0) override;
+
+    void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t offset) override;
+
+    bool ImageExtensionSupported() override { return false; }  // images are not created through this context
+
+    hsa_status_t ImageCreate(
+        hsa_agent_t agent,
+        hsa_access_permission_t image_permission,
+        const hsa_ext_image_descriptor_t *image_descriptor,
+        const void *image_data,
+        hsa_ext_image_t *image_handle) override {
+        // not supported
+        assert(false);
+        return HSA_STATUS_ERROR;
+    }
+
+    hsa_status_t ImageDestroy(
+        hsa_agent_t agent, hsa_ext_image_t image_handle) override {
+        // not supported
+        assert(false);
+        return HSA_STATUS_ERROR;
+    }
+
+    hsa_status_t SamplerCreate(
+        hsa_agent_t agent,
+        const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+        hsa_ext_sampler_t *sampler_handle) override;
+
+    //! All samplers are owned by HSAILProgram and are deleted in its destructor.
+    hsa_status_t SamplerDestroy(
+        hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override;
+
+private:
+
+    void* AgentGlobalAlloc(  // global/readonly segments live in GPU memory; 'agent' is ignored
+        hsa_agent_t agent, size_t size, size_t align, bool zero) {
+        return GpuMemAlloc(size, align, zero);
+    }
+
+    bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) {
+        return GpuMemCopy(dst, offset, src, size);
+    }
+
+    void AgentGlobalFree(void *ptr, size_t size) {
+        GpuMemFree(ptr, size);
+    }
+
+    void* KernelCodeAlloc(  // code segments are kept in host memory; 'agent' is ignored
+        hsa_agent_t agent, size_t size, size_t align, bool zero) {
+        return CpuMemAlloc(size, align, zero);
+    }
+
+    bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) {
+        return CpuMemCopy(dst, offset, src, size);
+    }
+
+    void KernelCodeFree(void *ptr, size_t size) {
+        CpuMemFree(ptr, size);
+    }
+
+    void* CpuMemAlloc(size_t size, size_t align, bool zero);
+
+    bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size);
+
+    void CpuMemFree(void *ptr, size_t size) {  // 'size' is unused
+        amd::Os::alignedFree(ptr);
+    }
+
+    void* GpuMemAlloc(size_t size, size_t align, bool zero);
+
+    bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size);
+
+    void GpuMemFree(void *ptr, size_t size = 0) {  // 'size' is unused; ptr must be a gpu::Memory* from GpuMemAlloc
+        delete reinterpret_cast<gpu::Memory*>(ptr);  // NOTE(review): the pointer also sits in the program's global-store list - confirm lifetime
+    }
+
+    ORCAHSALoaderContext(const ORCAHSALoaderContext &c);  // private and, as far as this file shows, undefined: copying is disabled
+
+    ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c);  // likewise disabled
+
+    enum gfx_handle {  // values stored in hsa_isa_t::handle and compared with the device gfxipVersion_
+        gfx700 = 700,
+        gfx701 = 701,
+        gfx702 = 702,
+        gfx800 = 800,
+        gfx801 = 801,
+        gfx810 = 810,
+        gfx900 = 900
+    };
+
+    gpu::HSAILProgram* program_;  //!< Program whose device is used for allocations, transfers and samplers
+};
 
 //! \class HSAIL program
 class HSAILProgram : public device::Program
@@ -385,9 +507,9 @@ public:
     aclBinary* binaryElf() const {
         return static_cast<aclBinary*>(binaryElf_); }
 
-    void setGlobalStore(Memory* mem) { globalStore_ = mem; }
+    void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); }  //!< Appends mem to the program's list of global stores (no null or duplicate check)
 
-    const Memory* globalStore() const { return globalStore_; }
+    const std::vector<Memory*>& globalStores() const { return globalStores_; }  //!< Read-only reference to every global store added via addGlobalStore()
 
     //! Return a typecasted GPU device
     gpu::Device& dev()
@@ -497,11 +619,13 @@ private:
     aclBinary*      binaryElf_;     //!< Binary for the new compiler library
     void*           rawBinary_;     //!< Pointer to the raw binary
     aclBinaryOptions binOpts_;      //!< Binary options to create aclBinary
-    Memory*         globalStore_;   //!< Global memory for the program
+    std::vector<Memory*>         globalStores_;   //!< Global memory objects for the program (one entry per addGlobalStore() call)
     Memory*         kernels_;       //!< Table with kernel object pointers
     uint    maxScratchRegs_;    //!< Maximum number of scratch regs used in the program by individual kernel
     std::list<Sampler*>   staticSamplers_;    //!< List od internal static samplers
     bool            isNull_;        //!< Null program no memory allocations
+    amd::hsa::loader::Executable* executable_;    //!< Executable for HSA Loader
+    ORCAHSALoaderContext loaderContext_;    //!< Context for HSA Loader
 };
 
 /*@}*/} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
index 5f5e548b84..a8e5524fa1 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
@@ -14,6 +14,7 @@
 #include <sstream>
 #include <iostream>
 #include <ctime>
+#include "amd_hsa_loader.hpp"
 
 namespace gpu {
 
@@ -137,54 +138,36 @@ NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding)
 }
 
 bool
-HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize)
+HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
 {
-    // Copy the shader_isa into a buffer
-    hwMetaData_ = new char[shaderSize];
-    if (hwMetaData_ == NULL) {
+    if (!sym) {
         return false;
     }
-    memcpy(hwMetaData_, shader, shaderSize);
-
-    SC_SI_HWSHADER_CS* siMetaData = reinterpret_cast<SC_SI_HWSHADER_CS*>(hwMetaData_);
-
-    // Code to patch the pointers in the shader object.
-    // Must be preferably done in the compiler library
-    size_t offset = siMetaData->common.uSizeInBytes;
-    if (siMetaData->common.u32PvtDataSizeInBytes > 0) {
-        siMetaData->common.pPvtData =
-            reinterpret_cast<SC_BYTE *>(
-            reinterpret_cast<char *>(siMetaData) + offset);
-        offset += siMetaData->common.u32PvtDataSizeInBytes;
+    uint64_t akc_addr = 0;
+    if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&akc_addr))) {
+        return false;
     }
-    if (siMetaData->common.codeLenInByte > 0) {
-        siMetaData->common.hShaderMemHandle =
-            reinterpret_cast<char *>(siMetaData) + offset;
-        offset += siMetaData->common.codeLenInByte;
+    amd_kernel_code_t *akc = reinterpret_cast<amd_kernel_code_t*>(akc_addr);
+    cpuAqlCode_ = akc;
+    if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast<void*>(&codeSize_))) {
+        return false;
     }
-
-    char* headerBaseAddress =
-        reinterpret_cast<char*>(siMetaData->common.hShaderMemHandle);
-    amd_kernel_code_t* akc = reinterpret_cast<amd_kernel_code_t*>(
-        headerBaseAddress);
-
-    address codeStartAddress = reinterpret_cast<address>(akc);
-    address codeEndAddress = reinterpret_cast<address>(akc) + siMetaData->common.codeLenInByte;
-    codeSize_ = codeEndAddress - codeStartAddress;
-    code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize));
-
+    size_t akc_align = 0;
+    if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast<void*>(&akc_align))) {
+        return false;
+    }
+    code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align));
     // Initialize kernel ISA code
-    if ((code_ != NULL) && code_->create(Resource::Shader)) {
+    if (code_ && code_->create(Resource::Shader)) {
         address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
         // Copy only amd_kernel_code_t
-        memcpy(cpuCodePtr, codeStartAddress, codeSize_);
+        memcpy(cpuCodePtr,  reinterpret_cast<address>(akc), codeSize_);
         code_->unmap(NULL);
     }
     else {
         LogError("Failed to allocate ISA code!");
         return false;
     }
-    cpuAqlCode_ = akc;
 
     assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
         "Scratch must be DWORD aligned");
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 13f8d8495a..3ee22c2da4 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -3293,10 +3293,10 @@ VirtualGPU::processMemObjectsHSA(
         }
     }
 
-    if (hsaKernel.prog().globalStore() != NULL) {
+    for (gpu::Memory* mem : hsaKernel.prog().globalStores()) {
         const static bool IsReadOnly = false;
         // Validate global store for a dependency in the queue
-        memoryDependency().validate(*this, hsaKernel.prog().globalStore(), IsReadOnly);
+        memoryDependency().validate(*this, mem, IsReadOnly);
     }
 }