diff --git a/rocclr/runtime/device/cpu/cpusettings.cpp b/rocclr/runtime/device/cpu/cpusettings.cpp
index 14d43cdbe6..b7dfdf57de 100644
--- a/rocclr/runtime/device/cpu/cpusettings.cpp
+++ b/rocclr/runtime/device/cpu/cpusettings.cpp
@@ -10,8 +10,6 @@ namespace cpu {
 bool
 Settings::create()
 {
-    largeHostMemAlloc_ = true;
-
     // This code is temporary until cl_khr_fp64 is unconditional
     if (flagIsDefault(CL_KHR_FP64) || CL_KHR_FP64) {
         enableExtension(ClKhrFp64);
diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp
index 4e1ef8a2e8..d9c7a1a2b2 100644
--- a/rocclr/runtime/device/device.cpp
+++ b/rocclr/runtime/device/device.cpp
@@ -517,7 +517,6 @@ Settings::Settings()
     extensions_          = 0;
     partialDispatch_     = false;
     supportRA_           = true;
-    largeHostMemAlloc_   = false;
     customHostAllocator_ = false;
     waitCommand_         = AMD_OCL_WAIT_COMMAND;
     supportDepthsRGB_    = false;
diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp
index bb9d16b368..d2a313b6be 100644
--- a/rocclr/runtime/device/device.hpp
+++ b/rocclr/runtime/device/device.hpp
@@ -577,13 +577,12 @@ public:
         struct {
             uint    partialDispatch_: 1;    //!< Enables partial dispatch
             uint    supportRA_: 1;          //!< Support RA channel order format
-            uint    largeHostMemAlloc_: 1;  //!< Allow large host mem allocations (> maxSingleAlloc)
             uint    waitCommand_: 1;        //!< Enables a wait for every submitted command
             uint    customHostAllocator_: 1;//!< True if device has custom host allocator
                                             //  that replaces generic OS allocation routines
             uint    supportDepthsRGB_: 1;   //!< Support DEPTH and sRGB channel order format
             uint    enableHwDebug_: 1;      //!< Enable HW debug support
-            uint    reserved_: 25;
+            uint    reserved_: 26;
         };
         uint    value_;
     };
diff --git a/rocclr/runtime/device/gpu/gpublit.cpp b/rocclr/runtime/device/gpu/gpublit.cpp
index e201985e09..ab85396b93 100644
--- a/rocclr/runtime/device/gpu/gpublit.cpp
+++ b/rocclr/runtime/device/gpu/gpublit.cpp
@@ -1955,20 +1955,9 @@ KernelBlitManager::copyBufferRect(
 
     // Fall into the CAL path for rejected transfers
     if (setup_.disableCopyBufferRect_ ||
-        (gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) ||
-        (!dev().heap()->isVirtual() &&
-         ((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))) {
-        // Copy data with CAL (no VM mode only)
-        if (gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
-            result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
-                srcRectIn, dstRectIn, sizeIn, entire);
-        }
-
-        if ((!dev().heap()->isVirtual() && ((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))
-                && !result) {
-            result = HostBlitManager::copyBufferRect(srcMemory, dstMemory,
-                srcRectIn, dstRectIn, sizeIn, entire);
-        }
+        gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
+        result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
+            srcRectIn, dstRectIn, sizeIn, entire);
 
         if (result) {
             synchronize();
@@ -2395,11 +2384,9 @@ KernelBlitManager::copyBuffer(
 {
     amd::ScopedLock k(lockXferOps_);
     bool    result = false;
-    bool    forceCal = !dev().heap()->isVirtual() &&
-        ((gpuMem(srcMemory).hb() == NULL) || (gpuMem(dstMemory).hb() == NULL));
 
-    if ((!forceCal && !gpuMem(srcMemory).isHostMemDirectAccess() &&
-         !gpuMem(dstMemory).isHostMemDirectAccess())) {
+    if (!gpuMem(srcMemory).isHostMemDirectAccess() &&
+        !gpuMem(dstMemory).isHostMemDirectAccess()) {
         uint    blitType = BlitCopyBuffer;
         size_t  dim = 1;
         size_t  globalWorkOffset[3] = { 0, 0, 0 };
@@ -2489,7 +2476,6 @@ KernelBlitManager::copyBuffer(
         result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
     }
     else {
-        // Copy data with CAL (no VM mode only)
         result = DmaBlitManager::copyBuffer(
             srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
     }
diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index 32bd622b01..e8775b6db7 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -173,7 +173,7 @@ NullDevice::create(CALtarget target)
     calAttr.localRAM = 512;
 
     // Fill the device info structure
-    fillDeviceInfo(calAttr, memInfo, 4096, 1, true);
+    fillDeviceInfo(calAttr, memInfo, 4096, 1);
 
     if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
         // Runtime doesn't know what local size could be on the real board
@@ -225,9 +225,7 @@ void NullDevice::fillDeviceInfo(
     const CALdeviceattribs& calAttr,
     const gslMemInfo& memInfo,
     size_t  maxTextureSize,
-    uint    numComputeRings,
-    bool    isVirtualMode
-    )
+    uint    numComputeRings)
 {
     info_.type_     = CL_DEVICE_TYPE_GPU;
     info_.vendorId_ = 0x1002;
@@ -276,56 +274,45 @@ void NullDevice::fillDeviceInfo(
         info_.globalMemCacheType_   = CL_NONE;
     }
 
-    if (isVirtualMode) {
 #if defined(ATI_OS_LINUX)
-        info_.globalMemSize_   =
-            (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
-            // globalMemSize is the actual available size for app on Linux
-            // Because Linux base driver doesn't support paging
-            static_cast<cl_ulong>(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 100u);
+    info_.globalMemSize_   =
+        (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
+        // globalMemSize is the actual available size for app on Linux
+        // Because Linux base driver doesn't support paging
+        static_cast<cl_ulong>(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 100u);
 #else
-        info_.globalMemSize_   =
-            (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
-            static_cast<cl_ulong>(calAttr.localRAM) / 100u) * Mi;
+    info_.globalMemSize_   =
+        (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
+        static_cast<cl_ulong>(calAttr.localRAM) / 100u) * Mi;
 #endif
-        if (settings().apuSystem_) {
-            info_.globalMemSize_   +=
-                (static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi * 75)/100;
-        }
+    if (settings().apuSystem_) {
+        info_.globalMemSize_   +=
+            (static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi * 75)/100;
+    }
 
-        // We try to calculate the largest available memory size from
-        // the largest available block in either heap.  In theory this
-        // should be the size we can actually allocate at application
-        // start.  Note that it may not be a guarantee still as the
-        // application progresses.
-        info_.maxMemAllocSize_ = std::max(
-            cl_ulong(memInfo.cardLargestFreeBlockBytes),
-            cl_ulong(memInfo.cardExtLargestFreeBlockBytes));
+    // We try to calculate the largest available memory size from
+    // the largest available block in either heap.  In theory this
+    // should be the size we can actually allocate at application
+    // start.  Note that it may not be a guarantee still as the
+    // application progresses.
+    info_.maxMemAllocSize_ = std::max(
+        cl_ulong(memInfo.cardLargestFreeBlockBytes),
+        cl_ulong(memInfo.cardExtLargestFreeBlockBytes));
 
 #if defined(ATI_OS_WIN)
-        if (settings().apuSystem_) {
-            info_.maxMemAllocSize_ = std::max(
-                (static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi * 75)/100,
-                info_.maxMemAllocSize_);
-        }
+    if (settings().apuSystem_) {
+        info_.maxMemAllocSize_ = std::max(
+            (static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi * 75)/100,
+            info_.maxMemAllocSize_);
+    }
 #endif
-        info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ *
-            std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
+    info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ *
+        std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
 
-        //! \note Force max single allocation size.
-        //! 4GB limit for the blit kernels and 64 bit optimizations.
-        info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_,
-                static_cast<cl_ulong>(settings().maxAllocSize_));
-    }
-    else {
-        uint    maxHeapSize = flagIsDefault(GPU_MAX_HEAP_SIZE) ? 50 : GPU_MAX_HEAP_SIZE;
-        info_.globalMemSize_   = (std::min(maxHeapSize, 100u)
-            * calAttr.localRAM / 100u) * Mi;
-
-        uint    maxAllocSize = flagIsDefault(GPU_SINGLE_ALLOC_PERCENT) ? 25 : GPU_SINGLE_ALLOC_PERCENT;
-        info_.maxMemAllocSize_ = cl_ulong(info_.globalMemSize_ *
-            std::min(maxAllocSize, 100u) / 100u);
-    }
+    //! \note Force max single allocation size.
+    //! 4GB limit for the blit kernels and 64 bit optimizations.
+    info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_,
+            static_cast<cl_ulong>(settings().maxAllocSize_));
 
     if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) {
         LogError("We are unable to get a heap large enough to support the OpenCL minimum "\
@@ -377,7 +364,7 @@ void NullDevice::fillDeviceInfo(
         info_.imagePitchAlignment_       = 256; // XXX: 256 pixel pitch alignment for now
         info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now
 
-        info_.bufferFromImageSupport_ = (isVirtualMode) ? CL_TRUE : CL_FALSE;
+        info_.bufferFromImageSupport_ = CL_TRUE;
     }
 
     info_.errorCorrectionSupport_    = CL_FALSE;
@@ -404,7 +391,7 @@ void NullDevice::fillDeviceInfo(
     ::strcpy(info_.name_, hwInfo()->targetName_);
     ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
     ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1,
-         AMD_BUILD_STRING "%s", (isVirtualMode) ? " (VM)": "");
+         AMD_BUILD_STRING "%s", " (VM)");
 
     info_.profile_ = "FULL_PROFILE";
     if (settings().oclVersion_ == OpenCL20) {
@@ -508,6 +495,25 @@ void NullDevice::fillDeviceInfo(
     }
 }
 
+bool
+Device::Heap::create(Device& device)
+{
+    // Allocate the resource object; it is created as Resource::Heap below
+    resource_ = new Resource(device, 0, CM_SURF_FMT_R32I);
+    if (resource_ == NULL) {
+        return false;
+    }
+
+    if (!resource_->create(Resource::Heap)) {  // resource_ stays assigned on this failure path; NOTE(review): confirm the owner releases it
+        return false;
+    }
+
+    if (!device.settings().hsail_) {  // baseAddress_ is written only when the hsail_ setting is off
+        baseAddress_ = resource_->gslResource()->getSurfaceAddress();
+    }
+    return true;
+}
+
 void
 Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings)
 {
@@ -670,7 +676,7 @@ Device::Device()
     , CALGSLDevice()
     , numOfVgpus_(0)
     , context_(NULL)
-    , heap_(NULL)
+    , heap_()
     , dummyPage_(NULL)
     , lockAsyncOps_(NULL)
     , lockAsyncOpsForInitHeap_(NULL)
@@ -731,11 +737,6 @@ Device::~Device()
         dummyPage_->release();
     }
 
-    // Destroy global heap
-    if (heap_ != NULL) {
-        delete heap_;
-    }
-
     // Destroy resource cache
     delete resourceCache_;
 
@@ -837,26 +838,6 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
 
     size_t  resourceCacheSize = settings().resourceCacheSize_;
 
-    // Allocate heap
-    heapSize_ = settings().heapSize_;
-
-    // Check if BE supports virtual addressing mode
-    if (isVmMode()) {
-        heap_ = new VirtualHeap(*this);
-        gpuSettings->largeHostMemAlloc_ = (NULL != heap_) ? true : false;
-    }
-
-    // If virtual heap allocation failed, then try static allocation
-    if (heap_ == NULL) {
-        heap_ = new Heap(*this);
-        // Disable resource cache if VM is disable
-        resourceCacheSize = 0;
-        if (NULL == heap_) {
-            return false;
-        }
-    }
-
-
 #ifdef DEBUG
     std::stringstream  message;
     if (settings().remoteAlloc_) {
@@ -865,10 +846,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
     else {
         message << "Using *Local* memory";
     }
-    if (!heap()->isVirtual()) {
-        message << ": " << settings().heapSize_ / Mi << "MB, growth: " <<  \
-            settings().heapSizeGrowth_ / Mi << "MB";
-    }
+
     message << std::endl;
     LogInfo(message.str().c_str());
 #endif // DEBUG
@@ -883,8 +861,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
     // Fill the device info structure
     fillDeviceInfo(getAttribs(), getMemInfo(),
         static_cast<size_t>(getMaxTextureSize()),
-        engines().numComputeRings(), heap()->isVirtual()
-    );
+        engines().numComputeRings());
 
     if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
         if (NULL == hsaCompiler_) {
@@ -955,7 +932,7 @@ Device::initializeHeapResources()
         }
 
         // Complete initialization of the heap and other buffers
-        if ((heap_ == NULL) || !heap_->create(heapSize_, settings().remoteAlloc_)) {
+        if (!heap_.create(*this)) {
             LogError("Failed GPU heap creation");
             return false;
         }
@@ -987,7 +964,7 @@ Device::initializeHeapResources()
                     type = Resource::RemoteUSWC;
                 }
                 xferWrite_ = new XferBuffers(*this, type,
-                    amd::alignUp(settings().stagedXferSize_, heap()->granularityB()));
+                    amd::alignUp(settings().stagedXferSize_, 4 * Ki));
                 if ((xferWrite_ == NULL) || !xferWrite_->create()) {
                     LogError("Couldn't allocate transfer buffer objects for read");
                     return false;
@@ -997,7 +974,7 @@ Device::initializeHeapResources()
             // Initialize staged read buffers
             if (settings().stagedXferRead_) {
                 xferRead_ = new XferBuffers(*this, Resource::Remote,
-                    amd::alignUp(settings().stagedXferSize_, heap()->granularityB()));
+                    amd::alignUp(settings().stagedXferSize_, 4 * Ki));
                 if ((xferRead_ == NULL) || !xferRead_->create()) {
                     LogError("Couldn't allocate transfer buffer objects for write");
                     return false;
@@ -1086,52 +1063,6 @@ Device::createVirtualDevice(
     }
 }
 
-bool
-Device::reallocHeap(size_t size, bool remoteAlloc)
-{
-    size_t  heapSize    =  heapSize_ + ((size != 0) ?
-        amd::alignUp(size, settings().heapSizeGrowth_) : 0);
-    Heap*   oldHeap     = heap_;
-    // Maximum heap limit size = reported size + internal memory
-    size_t  maxHeapLimit = static_cast<size_t>(info().globalMemSize_) +
-        // an extra 10MB for the alignments of allocations,
-        // since the conformance test doesn't expect any
-        10 * Mi;
-
-    if ((settings().heapSizeGrowth_ == 0) ||
-        // Allow the heap growth up to the global memory limit
-        (heapSize_ + size > maxHeapLimit)) {
-        return false;
-    }
-    heapSize = std::min(maxHeapLimit, heapSize);
-
-    heap_ = new Heap(*this);
-
-    // Make sure we have allocated a new global heap
-    if (NULL == heap_) {
-        heap_ = oldHeap;
-        return false;
-    }
-
-    if (!heap_->create(heapSize, remoteAlloc)) {
-        delete heap_;
-        heap_ = oldHeap;
-        return false;
-    }
-
-    // Copy the old heap to the new one
-    if (!oldHeap->copyTo(heap_)) {
-        delete heap_;
-        heap_ = oldHeap;
-        return false;
-    }
-
-    delete oldHeap;
-    heapSize_ = heapSize;
-
-    return true;
-}
-
 device::Program*
 Device::createProgram(int oclVer)
 {
@@ -1288,65 +1219,6 @@ Device::tearDown()
     }
 }
 
-//! @note This funciton must be lock protected from a caller
-HeapBlock*
-Device::allocHeapBlock(size_t size) const
-{
-    HeapBlock* hb = NULL;
-
-    // Allocate the underlying heap block
-    hb = heap_->alloc(size);
-
-    // Virtual heap should never fail allocation
-    if ((hb == NULL) && (!heap_->isVirtual())) {
-        // Queues can't process commands,
-        // while the global heap reallocation occurs.
-        // So stall all queues and then reallocate the global heap
-        ScopedLockVgpus lock(*this);
-
-        // Wait for idle
-        for (uint idx = 0; idx < vgpus().size(); ++idx) {
-            vgpus()[idx]->waitAllEngines();
-        }
-
-        // Acount memory alignment for the new allocation
-        size_t  extraSpace = heap_->granularityB();
-        if (size >= heap_->freeSpace()) {
-            // Required extra space = requested size - free space
-            extraSpace += size - heap_->freeSpace();
-        }
-
-        //! @note the const cast here looks bad, but the device object
-        //  is a lock protected above. The rest of the code
-        //  doesn't change the device object.
-        //  So the const methods can be safly used everywhere else.
-        //  In general we should avoid changing the device object after initialization
-
-        // Try to reallocate the heap with the same memory type
-        if (const_cast<Device*>(this)->reallocHeap(extraSpace, settings().remoteAlloc_)) {
-            hb = heap_->alloc(size);
-        }
-
-        if (hb == NULL) {
-            // Use reversed memory type as a temporary storage
-            bool    remoteAlloc = settings().remoteAlloc_ ^ true;
-
-            // Try to reallocate the heap
-            if (const_cast<Device*>(this)->reallocHeap(extraSpace, remoteAlloc)) {
-                // Back to the default location of the global heap
-                remoteAlloc ^= true;
-                if (!const_cast<Device*>(this)->reallocHeap(0, remoteAlloc)) {
-                    LogWarning("New memory type for the \
-                        global heap after reallocation!");
-                }
-                hb = heap_->alloc(size);
-            }
-        }
-    }
-
-    return hb;
-}
-
 gpu::Memory*
 Device::getGpuMemory(amd::Memory* mem) const
 {
@@ -1392,99 +1264,20 @@ Device::createScratchBuffer(size_t size) const
 {
     Memory* gpuMemory = NULL;
 
-    // Use virtual heap allocation
-    if (heap()->isVirtual()) {
-        // Create a memory object
-        gpuMemory = new gpu::Memory(*this, size);
-        if (NULL == gpuMemory || !gpuMemory->create(Resource::Local)) {
-            delete gpuMemory;
-            gpuMemory = NULL;
-        }
-    }
-    else {
-        // We have to lock the heap block allocation,
-        // so possible reallocation won't occur twice or
-        // another thread could destroy a heap block,
-        // while we didn't finish allocation
-        amd::ScopedLock k(lockAsyncOps());
-
-        HeapBlock* hb = allocHeapBlock(size);
-        if (hb != NULL) {
-            // wrap it
-            gpuMemory = new gpu::Memory(*this, *hb);
-
-            // Create resource
-            if (NULL != gpuMemory) {
-                Resource::ViewParams   params;
-                params.offset_  = hb->offset_;
-                params.size_    = hb->size_;
-                params.resource_ = &(globalMem());
-                params.memory_  = NULL;
-                if (!gpuMemory->create(Resource::View, &params)) {
-                    delete gpuMemory;
-                    gpuMemory = NULL;
-                }
-            }
-        }
-    }
-
-    return gpuMemory;
-}
-
-gpu::Memory*
-Device::createBufferFromHeap(amd::Memory& owner) const
-{
-    size_t  size = owner.getSize();
-    gpu::Memory* gpuMemory;
-
-    // We have to lock the heap block allocation,
-    // so possible reallocation won't occur twice or
-    // another thread could destroy a heap block,
-    // while we didn't finish allocation
-    amd::ScopedLock k(lockAsyncOps());
-
-    HeapBlock* hb = allocHeapBlock(size);
-    if (hb == NULL) {
-        LogError("We don't have enough video memory!");
-        return NULL;
-    }
-
     // Create a memory object
-    gpuMemory = new gpu::Memory(*this, owner, hb);
-    if (NULL == gpuMemory) {
-        hb->setMemory(NULL);
-        hb->free();
-        return NULL;
-    }
-
-    Resource::ViewParams params;
-    params.owner_       = &owner;
-    params.offset_      = hb->offset_;
-    params.size_        = hb->size_;
-    params.resource_    = &(globalMem());
-    params.memory_      = NULL;
-
-    if (!gpuMemory->create(Resource::View, &params)) {
+    gpuMemory = new gpu::Memory(*this, size);
+    if (NULL == gpuMemory || !gpuMemory->create(Resource::Local)) {
         delete gpuMemory;
-        return NULL;
+        gpuMemory = NULL;
     }
 
-    // Check if owner is interop memory
-    if (owner.isInterop()) {
-        if (!gpuMemory->createInterop(Memory::InteropHwEmulation)) {
-            LogError("HW interop creation failed!");
-            delete gpuMemory;
-            return NULL;
-        }
-    }
     return gpuMemory;
 }
 
 gpu::Memory*
 Device::createBuffer(
     amd::Memory&    owner,
-    bool            directAccess,
-    bool            bufferAlloc) const
+    bool            directAccess) const
 {
     size_t  size = owner.getSize();
     gpu::Memory* gpuMemory;
@@ -1504,39 +1297,7 @@ Device::createBuffer(
             return NULL;
         }
 
-        if (!heap()->isVirtual()) {
-            bool    uhpAlloc =
-                (owner.parent()->getMemFlags() & CL_MEM_USE_HOST_PTR) ? true : false;
-
-            if (owner.parent()->getType() != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
-                //! \note This extra line is necessary to make sure that subbuffer
-                //! allocation is a synch operation,
-                //! due to a possible realloc of heap(no VM) or parent(UHP)
-                amd::ScopedLock k(lockAsyncOps());
-
-                //! @note: For now make sure the parent is allocated in the global heap
-                //! or if it's the UHP optimization for prepinned memory
-                if (((gpuParent->hb() == NULL) || uhpAlloc) &&
-                    !owner.parent()->reallocedDeviceMemory(this)) {
-                    if (reallocMemory(*owner.parent())) {
-                        gpuParent = getGpuMemory(owner.parent());
-                    }
-                    else {
-                        LogError("Can't reallocate the owner object for subbuffer allocation");
-                        return NULL;
-                    }
-                }
-
-                return gpuParent->createBufferView(owner);
-            }
-            else {
-                gpuParent = getGpuMemory(owner.parent()->parent());
-                return gpuParent->createBufferView(*owner.parent()->parent());
-            }
-        }
-        else {
-            return gpuParent->createBufferView(owner);
-        }
+        return gpuParent->createBufferView(owner);
     }
 
     Resource::MemoryType    type = (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ?
@@ -1550,138 +1311,123 @@ Device::createBuffer(
     }
 
     // Use direct access if it's possible
-    if (bufferAlloc || (type == Resource::Remote)) {
-        bool    forceHeapAlloc = false;
-        bool    remoteAlloc = false;
-        // Internal means VirtualDevice!=NULL
-        bool    internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) &&
-              (owner.getVirtualDevice() != NULL)) ? true : false;
+    bool    remoteAlloc = false;
+    // Internal means VirtualDevice!=NULL
+    bool    internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) &&
+            (owner.getVirtualDevice() != NULL)) ? true : false;
 
-        // Create a memory object
-        gpuMemory = new gpu::Buffer(*this, owner, owner.getSize());
-        if (NULL == gpuMemory) {
-            return NULL;
-        }
+    // Create a memory object
+    gpuMemory = new gpu::Buffer(*this, owner, owner.getSize());
+    if (NULL == gpuMemory) {
+        return NULL;
+    }
 
-        // Check if owner is interop memory
-        if (owner.isInterop()) {
-            result = gpuMemory->createInterop(Memory::InteropDirectAccess);
-        }
-        else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) {
-            // Attempt to allocate from persistent heap
-            result = gpuMemory->create(Resource::Persistent);
-        }
-        else if (directAccess || (type == Resource::Remote)) {
-            // Check for system memory allocations
-            if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR))
-                || (settings().remoteAlloc_)) {
-                // Allocate remote memory if AHP allocation and context has just 1 device
-                if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) &&
-                    (owner.getContext().devices().size() == 1)) {
-                    if (owner.getMemFlags() & (CL_MEM_READ_ONLY |
-                        CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
-                        // GPU will be reading from this host memory buffer,
-                        // so assume Host write into it
-                        type = Resource::RemoteUSWC;
-                        remoteAlloc = true;
-                    }
+    // Check if owner is interop memory
+    if (owner.isInterop()) {
+        result = gpuMemory->createInterop(Memory::InteropDirectAccess);
+    }
+    else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) {
+        // Attempt to allocate from persistent heap
+        result = gpuMemory->create(Resource::Persistent);
+    }
+    else if (directAccess || (type == Resource::Remote)) {
+        // Check for system memory allocations
+        if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR))
+            || (settings().remoteAlloc_)) {
+            // Allocate remote memory if AHP allocation and context has just 1 device
+            if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) &&
+                (owner.getContext().devices().size() == 1)) {
+                if (owner.getMemFlags() & (CL_MEM_READ_ONLY |
+                    CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+                    // GPU will be reading from this host memory buffer,
+                    // so assume Host write into it
+                    type = Resource::RemoteUSWC;
+                    remoteAlloc = true;
                 }
-                // Make sure owner has a valid hostmem pointer and it's not COPY
-                if (!remoteAlloc && (owner.getHostMem() != NULL)) {
-                    Resource::PinnedParams params;
-                    params.owner_ = &owner;
-                    params.gpu_ =
-                        reinterpret_cast<VirtualGPU*>(owner.getVirtualDevice());
+            }
+            // Make sure owner has a valid hostmem pointer and it's not COPY
+            if (!remoteAlloc && (owner.getHostMem() != NULL)) {
+                Resource::PinnedParams params;
+                params.owner_ = &owner;
+                params.gpu_ =
+                    reinterpret_cast<VirtualGPU*>(owner.getVirtualDevice());
 
-                    params.hostMemRef_  = owner.getHostMemRef();
-                    params.size_        = owner.getHostMemRef()->size();
-                    if (0 == params.size_) {
-                        params.size_ = owner.getSize();
-                    }
-                    // Create memory object
-                    result = gpuMemory->create(Resource::Pinned, &params);
+                params.hostMemRef_  = owner.getHostMemRef();
+                params.size_        = owner.getHostMemRef()->size();
+                if (0 == params.size_) {
+                    params.size_ = owner.getSize();
+                }
+                // Create memory object
+                result = gpuMemory->create(Resource::Pinned, &params);
 
-                    // If direct access failed
-                    if (!result) {
-                        // and VM off, then force a heap allocation
-                        if (!heap()->isVirtual()) {
-                            // Internal pinning doesn't need a heap allocation
-                            if (!internalAlloc) {
-                                forceHeapAlloc = true;
-                            }
-                        }
-                        // Don't use cached allocation
-                        // if size is biger than max single alloc
-                        if (owner.getSize() > info().maxMemAllocSize_) {
-                            delete gpuMemory;
-                            return NULL;
-                        }
+                // If direct access failed
+                if (!result) {
+                    // Don't use cached allocation
+                    // if size is bigger than max single alloc
+                    if (owner.getSize() > info().maxMemAllocSize_) {
+                        delete gpuMemory;
+                        return NULL;
                     }
                 }
             }
         }
+    }
 
-        if (!result && !forceHeapAlloc &&
-            // Make sure it's not internal alloc
-            !internalAlloc) {
-            Resource::CreateParams  params;
-            params.owner_ = &owner;
-            params.gpu_ = static_cast<VirtualGPU*>(owner.getVirtualDevice());
+    if (!result &&
+        // Make sure it's not internal alloc
+        !internalAlloc) {
+        Resource::CreateParams  params;
+        params.owner_ = &owner;
+        params.gpu_ = static_cast<VirtualGPU*>(owner.getVirtualDevice());
 
-            // Create memory object
-            result = gpuMemory->create(type, &params);
+        // Create memory object
+        result = gpuMemory->create(type, &params);
 
-            // If allocation was successful
-            if (result) {
-                // Initialize if the memory is a pipe object
-                if (owner.getType() == CL_MEM_OBJECT_PIPE) {
-                    // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
-                    // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
-                    size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
-                    gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
+        // If allocation was successful
+        if (result) {
+            // Initialize if the memory is a pipe object
+            if (owner.getType() == CL_MEM_OBJECT_PIPE) {
+                // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
+                // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
+                size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
+                gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
+            }
+            // If memory has direct access from host, then get CPU address
+            if (gpuMemory->isHostMemDirectAccess() &&
+                (type != Resource::ExternalPhysical)) {
+                void* address = gpuMemory->map(NULL);
+                if (address != NULL) {
+                    // Copy saved memory
+                    if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) {
+                        memcpy(address, owner.getHostMem(), owner.getSize());
+                    }
+                    // It should be safe to change the host memory pointer,
+                    // because it's lock protected from the upper caller
+                    owner.setHostMem(address);
                 }
-                // If memory has direct access from host, then get CPU address
-                if (gpuMemory->isHostMemDirectAccess() &&
-                   (type != Resource::ExternalPhysical)) {
-                    void* address = gpuMemory->map(NULL);
-                    if (address != NULL) {
-                        // Copy saved memory
-                        if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) {
-                            memcpy(address, owner.getHostMem(), owner.getSize());
-                        }
-                        // It should be safe to change the host memory pointer,
-                        // because it's lock protected from the upper caller
-                        owner.setHostMem(address);
-                    }
-                    else {
-                        result = false;
-                    }
-                }
-                // An optimization for CHP. Copy memory and destroy sysmem allocation
-                else if ((gpuMemory->memoryType() != Resource::Pinned) &&
-                         (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
-                         (owner.getContext().devices().size() == 1)) {
-                    amd::Coord3D    origin(0, 0, 0);
-                    amd::Coord3D    region(owner.getSize());
-                    static const bool Entire  = true;
-                    if (xferMgr().writeBuffer(owner.getHostMem(),
-                        *gpuMemory, origin, region, Entire)) {
-                        // Clear CHP memory
-                        owner.setHostMem(NULL);
-                    }
+                else {
+                    result = false;
+                }
+            }
+            // An optimization for CHP. Copy memory and destroy sysmem allocation
+            else if ((gpuMemory->memoryType() != Resource::Pinned) &&
+                        (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
+                        (owner.getContext().devices().size() == 1)) {
+                amd::Coord3D    origin(0, 0, 0);
+                amd::Coord3D    region(owner.getSize());
+                static const bool Entire  = true;
+                if (xferMgr().writeBuffer(owner.getHostMem(),
+                    *gpuMemory, origin, region, Entire)) {
+                    // Clear CHP memory
+                    owner.setHostMem(NULL);
                 }
             }
-        }
-
-        if (!result && !forceHeapAlloc) {
-            delete gpuMemory;
-            return NULL;
         }
     }
 
     if (!result) {
-        assert(!heap()->isVirtual() && "Can't have static heap allocation with VM");
-        gpuMemory = createBufferFromHeap(owner);
+        delete gpuMemory;
+        return NULL;
     }
 
     return gpuMemory;
@@ -1703,10 +1449,10 @@ Device::createImage(amd::Memory& owner, bool directAccess) const
         }
         // Create a view on the specified device
         gpuImage = (gpu::Memory*)createView(owner, *devParent);
-        if (heap()->isVirtual() && (NULL != gpuImage) && (gpuImage->owner() != NULL)) {
+        if ((NULL != gpuImage) && (gpuImage->owner() != NULL)) {
             gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + gpuImage->owner()->getOrigin());
         }
-        return gpuImage ;
+        return gpuImage;
     }
 
     gpuImage = new gpu::Image(*this, owner,
@@ -1778,11 +1524,11 @@ Device::createImage(amd::Memory& owner, bool directAccess) const
                  (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
                  (owner.getContext().devices().size() == 1)) {
             // Ignore copy for image1D_buffer, since it was already done for buffer
-            if (heap()->isVirtual() && imageBuffer) {
+            if (imageBuffer) {
                 // Clear CHP memory
                 owner.setHostMem(NULL);
             }
-            else if (!imageBuffer) {
+            else {
                 amd::Coord3D    origin(0, 0, 0);
                 static const bool Entire  = true;
                 if (xferMgr().writeImage(owner.getHostMem(),
@@ -1809,25 +1555,12 @@ Device::createMemory(
     amd::Memory&    owner) const
 {
     bool directAccess   = false;
-    bool bufferAlloc    = false;
     gpu::Memory* memory = NULL;
 
-    if (heap()->isVirtual()) {
-        bufferAlloc = true;
-    }
-    //!@todo Remove this code when VM is always on.
-    // Use zero-copy transfers for sysmem allocations or persistent memory
-    else {
-        if (owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR |
-                                   CL_MEM_USE_HOST_PTR)) {
-            bufferAlloc = true;
-        }
-    }
-
     if (owner.asBuffer()) {
         directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemBuffer)
             ? true : false;
-        memory = createBuffer(owner, directAccess, bufferAlloc);
+        memory = createBuffer(owner, directAccess);
     }
     else if (owner.asImage()) {
         directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemImage)
@@ -1878,7 +1611,6 @@ bool
 Device::reallocMemory(amd::Memory& owner) const
 {
     bool directAccess   = false;
-    bool bufferAlloc    = heap()->isVirtual();
 
     // For now we have to serialize reallocation code
     amd::ScopedLock lk(*lockAsyncOps_);
@@ -1889,35 +1621,18 @@ Device::reallocMemory(amd::Memory& owner) const
     if (gpuMemory == NULL) {
         return false;
     }
-    if (gpuMemory->hb() != NULL) {
+
+    if (gpuMemory->pinOffset() == 0) {
         return true;
     }
-
-    if (bufferAlloc) {
-        if (gpuMemory->pinOffset() == 0) {
-            return true;
-        }
-        else if (NULL != owner.parent()) {
-            if (!reallocMemory(*owner.parent())) {
-                return false;
-            }
+    else if (NULL != owner.parent()) {
+        if (!reallocMemory(*owner.parent())) {
+            return false;
         }
     }
 
     if (owner.asBuffer()) {
-        // Disable remote allocation if no VM
-        if ((gpuMemory != NULL) &&
-            ((gpuMemory->memoryType() == Resource::Remote) ||
-             (gpuMemory->memoryType() == Resource::RemoteUSWC)) && !bufferAlloc) {
-            // Make sure we don't have a stale memory in VA cache before reallocation
-            // of system memory.
-            // \note: the app must unmap() memory before kernel launch
-            removeVACache(gpuMemory);
-            static const bool forceAllocHostMem = true;
-            static const bool forceCopy = true;
-            owner.allocHostMemory(owner.getHostMem(), forceAllocHostMem, forceCopy);
-        }
-        gpuMemory = createBuffer(owner, directAccess, bufferAlloc);
+        gpuMemory = createBuffer(owner, directAccess);
     }
     else if (owner.asImage()) {
         return true;
@@ -2113,24 +1828,18 @@ Device::globalFreeMemory(size_t* freeMemory) const
     if (!(const_cast<Device*>(this)->initializeHeapResources())) {
         return false;
     }
-    if (heap()->isVirtual()) {
-        gslMemInfo memInfo = {0};
-        gslCtx()->getMemInfo(&memInfo, GSL_MEMINFO_BASIC);
 
-         // Fill free memory info
-        freeMemory[TotalFreeMemory] = (memInfo.cardMemAvailableBytes +
-            memInfo.cardExtMemAvailableBytes) / Ki;
-        freeMemory[LargestFreeBlock] = std::max(memInfo.cardLargestFreeBlockBytes,
-           memInfo.cardExtLargestFreeBlockBytes) / Ki;
-        if (settings().apuSystem_) {
-            freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableBytes / Ki;
-            freeMemory[LargestFreeBlock] += memInfo.agpLargestFreeBlockBytes / Ki;
-        }
-    }
-    else {
-        freeMemory[TotalFreeMemory] = static_cast<size_t>((info().globalMemSize_ -
-            static_cast<cl_ulong>(heapSize_) + heap()->freeSpace()) / Ki);
-        freeMemory[LargestFreeBlock] = freeMemory[TotalFreeMemory];
+    gslMemInfo memInfo = {0};
+    gslCtx()->getMemInfo(&memInfo, GSL_MEMINFO_BASIC);
+
+    // Fill free memory info (values are reported in KiB)
+    freeMemory[TotalFreeMemory] = (memInfo.cardMemAvailableBytes +
+        memInfo.cardExtMemAvailableBytes) / Ki;
+    freeMemory[LargestFreeBlock] = std::max(memInfo.cardLargestFreeBlockBytes,
+        memInfo.cardExtLargestFreeBlockBytes) / Ki;
+    if (settings().apuSystem_) {
+        freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableBytes / Ki;
+        freeMemory[LargestFreeBlock] += memInfo.agpLargestFreeBlockBytes / Ki;
     }
 
     return true;
diff --git a/rocclr/runtime/device/gpu/gpudevice.hpp b/rocclr/runtime/device/gpu/gpudevice.hpp
index 4072ed9670..045ee2b40c 100644
--- a/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -125,8 +125,7 @@ protected:
         const CALdeviceattribs& calAttr,    //!< CAL device attributes info
         const gslMemInfo&  memInfo,         //!< GSL mem info
         size_t  maxTextureSize,             //!< Maximum texture size supported in HW
-        uint    numComputeRings,            //!< Number of compute rings
-        bool    isVirtualMode               //!< Device is in virtual mode
+        uint    numComputeRings             //!< Number of compute rings
         );
 };
 
@@ -184,6 +183,32 @@ private:
 class Device : public NullDevice, public CALGSLDevice
 {
 public:
+    class Heap : public amd::EmbeddedObject   // Global heap descriptor: a GPU resource plus its base address
+    {
+    public:
+        //! The size of a heap element in bytes
+        static const size_t ElementSize = 4;
+
+        //! The surface format used for a heap element (CM_SURF_FMT_R32I)
+        static const cmSurfFmt ElementType = CM_SURF_FMT_R32I;
+
+        Heap(): resource_(NULL), baseAddress_(0) {}   //!< Starts with no resource and a zero base address
+
+        bool create(            // Defined out of line; presumably sets up resource_/baseAddress_ - confirm in gpudevice.cpp
+            Device& device      //!< GPU device object
+            );
+
+        //! Gets the GPU resource of the global heap; dereferences resource_, so it must be non-NULL
+        const Resource& resource() const { return *resource_; }
+
+        //! Returns the base virtual address of the heap (0 until assigned)
+        uint64_t baseAddress() const { return baseAddress_; }
+
+    protected:
+        Resource*   resource_;      //!< GPU resource referencing the heap memory (NULL after construction)
+        uint64_t    baseAddress_;   //!< Virtual heap base address
+    };
+
     //! Locks any access to the virtual GPUs
     class ScopedLockVgpus : public amd::StackObject {
     public:
@@ -377,12 +402,6 @@ public:
     //! Destructor for the physical GPU device
     virtual ~Device();
 
-    //! Reallocates current global heap
-    bool reallocHeap(
-        size_t  size,           //!< requested size for reallocation
-        bool    remoteAlloc     //!< allocate the new heap in remote memory
-        );
-
     //! Instantiate a new virtual device
     device::VirtualDevice* createVirtualDevice(
         amd::CommandQueue*  queue = NULL
@@ -442,15 +461,10 @@ public:
         ) const;
 
     //! Gets the GPU resource associated with the global heap
-    const Resource& globalMem() const { return heap_->resource(); }
+    const Resource& globalMem() const { return heap_.resource(); }
 
     //! Gets the global heap object
-    const Heap* heap() const { return heap_; }
-
-    //! Allocates a heap block from the global heap
-    HeapBlock* allocHeapBlock(
-        size_t size             //!< The heap block size for allocation
-        ) const;
+    const Heap& heap() const { return heap_; }
 
     //! Gets the memory object for the dummy page
     amd::Memory* dummyPage() const { return dummyPage_; }
@@ -566,16 +580,10 @@ private:
     //! Sends the stall command to all queues
     bool stallQueues();
 
-    //! Buffer allocation from static heap (no VM mode only)
-    gpu::Memory* createBufferFromHeap(
-        amd::Memory&    owner           //!< Abstraction layer memory object
-        ) const;
-
     //! Buffer allocation
     gpu::Memory* createBuffer(
         amd::Memory&    owner,          //!< Abstraction layer memory object
-        bool            directAccess,   //!< Use direct host memory access
-        bool            bufferAlloc     //!< If TRUE, then don't use heap
+        bool            directAccess    //!< Use direct host memory access
         ) const;
 
     //! Image allocation
@@ -591,8 +599,7 @@ private:
         );
 
     amd::Context*   context_;       //!< A dummy context for internal allocations
-    size_t      heapSize_;          //!< The global heap size
-    Heap*       heap_;              //!< GPU heap manager
+    Heap            heap_;          //!< GPU global heap
     amd::Memory*    dummyPage_;     //!< A dummy page for NULL pointer
 
     amd::Monitor*   lockAsyncOps_;  //!< Lock to serialise all async ops on this device
diff --git a/rocclr/runtime/device/gpu/gpuheap.cpp b/rocclr/runtime/device/gpu/gpuheap.cpp
deleted file mode 100644
index 28cc32ed29..0000000000
--- a/rocclr/runtime/device/gpu/gpuheap.cpp
+++ /dev/null
@@ -1,536 +0,0 @@
-//! Implementation of GPU device memory management
-
-#include "top.hpp"
-#include "thread/thread.hpp"
-#include "thread/monitor.hpp"
-#include "device/device.hpp"
-#include "device/gpu/gpuheap.hpp"
-#include "device/gpu/gpudevice.hpp"
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include <iostream>
-
-//! Turn this on to enable sanity checks before and after every heap operation.
-#if DEBUG
-#define EXTRA_HEAP_CHECKS   1
-#endif // DEBUG
-
-namespace gpu {
-
-// The GPU heap. Very simple implementation for now.
-Heap::Heap(
-    Device& device)
-    : resource_(NULL)
-    , freeList_(NULL)
-    , busyList_(NULL)
-    , freeSize_(0)
-    , device_(device)
-    , granularity_(Heap::MinGranularity)
-    , lock_("GPU heap lock", true)
-    , virtualMode_(false)
-    , baseAddress_(0)
-{
-}
-
-size_t
-Heap::granularityB() const
-{
-    return granularity_ * Heap::ElementSize;
-}
-
-bool
-Heap::create(size_t totalSize, bool remoteAlloc)
-{
-    Resource::MemoryType    memType;
-    size_t  maxHeight = device_.info().image2DMaxHeight_;
-    size_t  sizeInElements;
-    size_t  npages;
-
-    freeSize_ = totalSize;
-
-    sizeInElements = (totalSize + Heap::ElementSize - 1) / Heap::ElementSize;
-
-    // Calculate best granularity given the size and device characteristics
-    npages = amd::alignUp(sizeInElements, granularity_) / granularity_;
-
-    // Create a new GPU resource
-    resource_ = new Resource(device_, sizeInElements, Heap::ElementType);
-
-    if (resource_ == NULL) {
-        return false;
-    }
-
-    memType = (remoteAlloc) ? Resource::RemoteUSWC : Resource::Local;
-
-    if (!resource_->create(memType, NULL, true)) {
-        return false;
-    }
-
-    // Set up initial free list
-    freeList_ = new HeapBlock(this, npages * granularityB(), 0, NULL, NULL);
-    if (freeList_ == NULL) {
-        return false;
-    }
-
-    guarantee(isSane());
-    return true;
-}
-
-Heap::~Heap()
-{
-    amd::ScopedLock k(lock_);
-
-    guarantee(isSane());
-
-    // Release all heap blocks
-    HeapBlock *walk, *next;
-    walk = busyList_;
-    while (walk) {
-        next = walk->next_;
-        walk->free();
-        walk = next;
-    }
-
-    walk = freeList_;
-    while (walk) {
-        next = walk->next_;
-        delete walk;
-        walk = next;
-    }
-
-    // Release resource
-    delete resource_;
-}
-
-HeapBlock*
-Heap::alloc(size_t size)
-{
-    amd::ScopedLock k(lock_);
-    HeapBlock* walk = freeList_;
-    HeapBlock* best = NULL;
-
-    guarantee(isSane());
-
-    // Round size
-    size = amd::alignUp(size, granularityB());
-
-    // Walk the free list looking for a suitable block (currently best-fit)
-    //! @todo:dgladdin: experiment with switching back to first-fit
-
-    while (walk) {
-        if ((walk->size_ > size) &&
-            (best == NULL || walk->size_ < best->size_)) {
-                best = walk;
-        }
-        else if (walk->size_ == size) {
-            // No need to split, just move to busy list
-            detachBlock(&freeList_, walk);
-            walk->inUse_ = true;
-            insertBlock(&busyList_, walk);
-            guarantee(isSane());
-            freeSize_ -= size;
-            return walk;
-        }
-    walk = walk->next_;
-    }
-
-    if (best != NULL) {
-        // Got one, but need to split it. Keep first part in free list,
-        // put second part into busy list.
-        HeapBlock *newblock = splitBlock(best, size);
-        newblock->inUse_ = true;
-        insertBlock(&busyList_, newblock);
-        guarantee(isSane());
-        freeSize_ -= size;
-        return newblock;
-    }
-
-    // No free block available
-    guarantee(isSane());
-    return NULL;
-}
-
-bool
-Heap::copyTo(Heap* heap)
-{
-    HeapBlock    *walk;
-
-    walk = busyList_;
-    while (walk) {
-        if (walk->getMemory() != NULL) {
-            HeapBlock* hb = heap->alloc(walk->size_);
-            if (hb == NULL) {
-                return false;
-            }
-            hb->setMemory(walk->getMemory());
-
-            walk->destroyViewsMemory();
-            if (!walk->getMemory()->reallocate(hb, &(heap->resource()))) {
-                return false;
-            }
-
-            if (!walk->reallocateViews(hb,
-                    static_cast<size_t>(hb->offset_ - walk->offset_))) {
-                return false;
-            }
-        }
-        walk = walk->next_;
-    }
-
-    return true;
-}
-
-void
-Heap::free(HeapBlock* blk)
-{
-    amd::ScopedLock k(lock_);
-    guarantee(isSane());
-    detachBlock(&busyList_, blk);
-    blk->inUse_ = false;
-    freeSize_ += blk->size_;
-    mergeBlock(&freeList_, blk);
-    guarantee(isSane());
-}
-
-void
-Heap::detachBlock(HeapBlock** list, HeapBlock* blk)
-{
-    // Sanity checks
-    guarantee(isSane());
-
-    if (*list == blk) {
-        *list = blk->next_;
-    }
-
-    if (blk->prev_) {
-       blk->prev_->next_ = blk->next_;
-    }
-    if (blk->next_) {
-        blk->next_->prev_ = blk->prev_;
-    }
-    // no heap sanity check as blk is now floating
-}
-
-void
-Heap::insertBlock(HeapBlock** head, HeapBlock* blk)
-{
-     if (NULL == *head) {
-        *head = blk;
-        blk->prev_ = NULL;
-        blk->next_ = NULL;
-        guarantee(isSane());
-        return;
-    }
-
-    // Find the place to insert it at
-    HeapBlock* walk = *head;
-    while (walk->next_ && walk->next_->offset_ < blk->offset_) {
-        walk = walk->next_;
-    }
-
-    // Insert it
-    if (walk == *head) {
-        if (walk->offset_ >= blk->offset_) {
-            *head = blk;
-            blk->prev_ = NULL;
-            blk->next_ = walk;
-            walk->prev_ = *head;
-            guarantee(isSane());
-            return;
-        }
-    }
-
-    blk->next_ = walk->next_;
-    blk->prev_ = walk;
-    if (walk->next_) {
-        walk->next_->prev_ = blk;
-    }
-    walk->next_ = blk;
-    guarantee(isSane());
-}
-
-HeapBlock*
-Heap::splitBlock(HeapBlock* blk, size_t tailsize)
-{
-    // Sanity checks
-
-    guarantee(isSane());
-    guarantee(blk->size_ > tailsize && "block too small to split as requested");
-    guarantee(!blk->inUse_ && "can't split in-use block");
-
-    // Create a new block
-
-    HeapBlock* nb = new HeapBlock(blk->owner_, tailsize,
-                                  blk->offset_ + blk->size_ - tailsize);
-
-    // Resize the old block
-
-    blk->size_ = blk->size_ - tailsize;
-    return nb;  // no heap sanity check here as the new block hasn't been plugged in yet
-}
-
-//! Join two blocks, transferring the size of the second into the first and deleting
-//! the second. Utility fn for mergeBlock()
-
-static void
-join2Blocks(HeapBlock* first, HeapBlock* second)
-{
-    // Sanity checks
-
-    guarantee(first->size_ > 0 && "first block invalid");
-    guarantee(!first->inUse_ && "can't join  an in-use block");
-    guarantee(second->size_ > 0 && "second block invalid");
-    guarantee(first->offset_ + first->size_ == second->offset_);
-
-    // Do the join
-    first->size_ = first->size_ + second->size_;
-    first->next_ = second->next_;
-    if (second->next_) {
-        second->next_->prev_ = first;
-    }
-    delete second;
-}
-
-//! Insert a block into a list, merging it with adjacent blocks if possible. Must be called
-//! under a lock, cannot be used on in-use blocks or blocks with an associated resource alias.
-
-void
-Heap::mergeBlock(HeapBlock** head, HeapBlock* blk)
-{
-    insertBlock(head, blk);
-
-    // Merge with successor if possible
-    if ((blk->next_ != NULL) &&
-        (blk->offset_ + blk->size_ == blk->next_->offset_)) {
-        join2Blocks(blk, blk->next_);
-    }
-
-    // Merge with predecessor if possible
-    if ((blk->prev_ != NULL) &&
-        (blk->prev_->offset_ + blk->prev_->size_ == blk->offset_)) {
-        join2Blocks(blk->prev_, blk);
-    }
-
-    guarantee(isSane());
-}
-
-//! Sanity check for both types of block (helper function for Heap::isSane())
-
-static bool
-isBlockSane(HeapBlock* b)
-{
-    return (b->owner_ != NULL
-        && (b->next_ == NULL || b->next_->prev_ == b)
-        && (b->prev_ == NULL || b->prev_->next_ == b));
-}
-
-//! Sanity check for an individual free block (helper function for Heap::isSane())
-static bool
-isFreeBlockSane(HeapBlock* b)
-{
-    if (isBlockSane(b) && !b->inUse_) {
-        return true;
-    } else {
-        return false;
-    }
-}
-
-//! Sanity check for an individual busy block (helper function for Heap::isSane())
-static bool
-isBusyBlockSane(HeapBlock* b)
-{
-    if (isBlockSane(b) && b->inUse_) {
-        return true;
-    } else {
-        return false;
-    }
-}
-
-//! Sanity check for the heap.
-
-bool
-Heap::isSane() const
-{
-    // If we got this far, everything is (probably) OK
-#if EXTRA_HEAP_CHECKS
-    HeapBlock* walkFree = freeList_;    // Free list position
-    HeapBlock* walkBusy = busyList_;    // Busy list position
-    size_t offset = 0;                  // Current offset
-
-    // We can have zero lists if Heap allocation fails
-    if (walkFree == NULL && walkBusy == NULL) {
-        return true;
-    }
-
-    // Walk both lists in parallel
-    while (walkFree != NULL || walkBusy != NULL) {
-        if (walkFree != NULL && walkFree->offset_ == offset) {
-            if (!isFreeBlockSane(walkFree)) {
-                return false;
-            }
-            offset += walkFree->size_;
-            walkFree = walkFree->next_;
-        }
-        else if (walkBusy != NULL && walkBusy->offset_ == offset) {
-            if (!isBusyBlockSane(walkBusy)) {
-                return false;
-            }
-            offset += walkBusy->size_;
-            walkBusy = walkBusy->next_;
-        }
-        else {
-            return false;
-        }
-    }
-
-#endif // EXTRA_HEAP_CHECKS
-    return true;
-}
-
-void
-HeapBlock::destroyViewsMemory()
-{
-    if ((parent_ != NULL) && (0 == views_.size())) {
-        memory_->free();
-    }
-    else if (views_.size() != 0) {
-        std::list<HeapBlock*>::const_iterator it;
-        for (it = views_.begin(); it != views_.end(); ++it) {
-            (*it)->destroyViewsMemory();
-        }
-    }
-}
-
-bool
-HeapBlock::reallocateViews(HeapBlock* parent, size_t shift)
-{
-    if (views_.size() != 0) {
-        std::list<HeapBlock*>::const_iterator it;
-
-        // Loop through all views and reallocate them
-        for (it = views_.begin(); it != views_.end(); ++it) {
-            // Get the view HeapBlock
-            HeapBlock* hb = (*it);
-
-            // Readjust the offset
-            hb->offset_ += shift;
-            // Add to the list if we have a new parent
-            if (parent != this) {
-                parent->addView(hb);
-            }
-
-            // Reallocate memory
-            hb->memory_->reallocate(hb, parent->getMemory());
-
-            // Process a view on view if available
-            if (!hb->reallocateViews(hb, shift)) {
-                return false;
-            }
-        }
-
-        // Destroy old list
-        if (parent != this) {
-            views_.clear();
-        }
-    }
-    return true;
-}
-
-//! Destructor. Frees the block if in use and does some final sanity checks.
-HeapBlock::~HeapBlock()
-{
-    if (NULL != owner_) {
-        if (inUse_) {
-            owner_->free(this);
-        }
-    }
-    else {
-        // View destruction
-        if (parent_ != NULL) {
-            assert(((parent_->getMemory() != NULL) && (parent_->getMemory()->owner() != NULL)));
-            amd::ScopedLock lock(parent_->getMemory()->owner()->lockMemoryOps());
-            parent_->removeView(this);
-        }
-    }
-    guarantee(size_ > 0 && "destructor called for zero-size heap block (destructor called twice?)");
-    size_ = 0; // Mark as invalid
-
-    if (views_.size() != 0) {
-        LogError("Can't destroy a resource if we still have views!");
-    }
-}
-
-void
-HeapBlock::free()
-{
-    if (NULL != owner_) {
-        owner_->free(this);
-    }
-    else {
-        // It's a view. Destroy the object
-        delete this;
-    }
-}
-
-VirtualHeap::VirtualHeap(
-    Device& device)
-    : Heap(device)
-{
-    virtualMode_ = true;
-}
-
-bool
-VirtualHeap::create(
-    size_t  totalSize,
-    bool    remoteAlloc)
-{
-    // Create a new GPU resource
-    resource_ = new Resource(device_, 0, Heap::ElementType);
-    if (resource_ == NULL) {
-        return false;
-    }
-
-    if (!resource_->create(Resource::Heap)) {
-        return false;
-    }
-
-    if (!device_.settings().hsail_) {
-        baseAddress_ = resource_->gslResource()->getSurfaceAddress();
-    }
-    return true;
-}
-
-VirtualHeap::~VirtualHeap()
-{
-}
-
-HeapBlock*
-VirtualHeap::alloc(size_t size)
-{
-    assert(false && "Dead branch!");
-    return NULL;
-}
-
-void
-VirtualHeap::free(HeapBlock* blk)
-{
-    assert(false && "Dead branch!");
-}
-
-bool
-VirtualHeap::copyTo(Heap* heap)
-{
-    assert(false && "Dead branch!");
-    return false;
-}
-
-bool
-VirtualHeap::isSane(void) const
-{
-    assert(false && "Dead branch!");
-    return true;
-}
-
-} // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuheap.hpp b/rocclr/runtime/device/gpu/gpuheap.hpp
deleted file mode 100644
index b38f316446..0000000000
--- a/rocclr/runtime/device/gpu/gpuheap.hpp
+++ /dev/null
@@ -1,225 +0,0 @@
-//! Declarations for GPU memory management
-
-#ifndef GPUHEAP_HPP_
-#define GPUHEAP_HPP_
-
-#include "top.hpp"
-#include "thread/atomic.hpp"
-#include "device/gpu/gpudefs.hpp"
-
-/*! \addtogroup GPU
- *  @{
- */
-
-//! GPU Device Implementation
-
-namespace gpu {
-
-class Device;
-class Heap;
-class Resource;
-class Memory;
-class VirtualGPU;
-
-//! @todo:dgladdin: The heap list should be singly-linked
-
-//! \brief A block on the GPU heap.
-//!
-//! Note that no code outside of the gpumemory.hpp/.cpp pair should touch this
-//! class directly as it is not thread-safe. In general, this class should be
-//! pretty much a struct and contain as little functionality as possible - just
-//!  a constructor, destructor.
-//!
-//! Any other methods - in particular, anything that talks to CAL - should be no
-//! more than proxies for functionality implemented in Heap, as Heap is aware
-//! of the lock state.
-
-class HeapBlock : public amd::HeapObject
-{
-public:
-    //! Constructor
-    HeapBlock(
-        Heap* owner = NULL,
-        size_t size = 0,
-        size_t offset = 0,
-        HeapBlock* next=NULL,
-        HeapBlock* prev=NULL)
-        : owner_(owner)
-        , size_(size)
-        , offset_(offset)
-        , next_(next)
-        , prev_(prev)
-        , inUse_(false)
-        , parent_(NULL)
-        , memory_(NULL)
-        {}
-
-    //! Destructor does some sanity checks.
-    ~HeapBlock();
-
-    //! Frees a heap block, returning its memory to the owning heap (proxy)
-    void free();
-
-    //! Sets the GPU memory object associated with the heap block
-    void setMemory(Memory* memory) { memory_ = memory; }
-
-    //! Gets the GPU memory object associated with the heap block
-    Memory* getMemory() const { return memory_; }
-
-    //! Adds a heapblock view to the list of views
-    void addView(HeapBlock* hb)
-        { views_.push_back(hb);  hb->parent_ = this; }
-
-    //! Removes a heapblock view from the list of views
-    void removeView(HeapBlock* hb) { views_.remove(hb); }
-
-    //! Destroys all views
-    void destroyViewsMemory();
-
-    //! Creates all new views
-    bool reallocateViews(
-        HeapBlock*  parent,     //!< Parent heap block
-        size_t      shift       //!< The new HeapBlock shift
-        );
-
-    //! Gets the offset
-    size_t offset() const { return offset_; }
-
-    Heap*       owner_;     //!< Heap that owns this block
-    size_t      size_;      //!< Size of the block in bytes
-    size_t      offset_;    //!< Offset of this block in the heap
-    HeapBlock*  next_;      //!< Next block on the list, or NULL
-    HeapBlock*  prev_;      //!< Previous block on the list, or NULL
-    bool        inUse_;     //!< true if the block is in use
-    HeapBlock*  parent_;    //!< The parent heap block for a view
-
-private:
-    //! Disable copy constructor
-    HeapBlock(const HeapBlock&);
-
-    //! Disable assignment
-    HeapBlock& operator=(const HeapBlock&);
-
-    Memory*     memory_;    //!< Memory object associated with the heap block
-    std::list<HeapBlock*>   views_; //!< The list of all allocated views
-};
-
-class Heap : public amd::HeapObject
-{
-public:
-    //! Minimal supported CAL granularity = 256 bytes / ElementSize
-    static const size_t MinGranularity = 64;
-
-    //! The size of a heap element in bytes
-    static const size_t ElementSize = 4;
-
-    //! The type of a heap element in bytes
-    static const cmSurfFmt ElementType = CM_SURF_FMT_R32I;
-
-    Heap(
-        Device& device      //!< GPU device object
-        );
-
-    virtual bool create(
-        size_t  totalSize,  //!< total size of the allocated heap (bytes)
-        bool    remoteAlloc //!< allocate the heap in remote memory
-        );
-
-    //! Heap destructor
-    virtual ~Heap();
-
-    /*!
-     * \brief Allocates memory from a heap (best-fit).
-     * We round up to 4k granularity for alignment.
-     *
-     * \return A pointer to allocated heap block object.
-     */
-    virtual HeapBlock* alloc(
-        size_t size     //! The allocation size
-        );
-
-    //! Release memory back to a heap.
-    virtual void free(HeapBlock* blk);
-
-    //! Copies this heap to another
-    virtual bool copyTo(Heap* heap);
-
-    //! Gets the GPU resource associated with the global heap
-    const Resource& resource() const { return *resource_; }
-
-    //! Read the page size (bytes)
-    size_t granularityB() const;
-
-    //! Read the total free space (bytes)
-    size_t freeSpace() const { return freeSize_; }
-
-    virtual bool isSane(void) const;    //!< Checks heap sanity
-
-    //! Returns true if we have a virtual heap
-    bool isVirtual() const { return virtualMode_; }
-
-    //! Returns the base virtual address of the heap
-    uint64_t baseAddress() const { return baseAddress_; }
-
-private:
-    //! Insert a block into a list. Must be called under a lock.
-    void insertBlock(HeapBlock** list, HeapBlock* node);
-
-    //! Merge a block into a list. Must be called under a lock.
-    void mergeBlock(HeapBlock** list, HeapBlock* node);
-
-    //! Remove a block from a list. Must be called under a lock.
-    void detachBlock(HeapBlock** list, HeapBlock* node);
-
-    //! Split a block into two pieces
-    HeapBlock* splitBlock(HeapBlock* node, size_t size);
-
-protected:
-    Resource*   resource_;      //!< GPU resource referencing the heap memory
-    HeapBlock*  freeList_;      //!< Head block for free list
-    HeapBlock*  busyList_;      //!< Head block for busy list
-    size_t      freeSize_;      //!< total free size of the heap
-    Device&     device_;        //!< Device that owns this heap
-    size_t      granularity_;   //!< Size of an allocation page
-    amd::Monitor    lock_;      //!< Lock to serialise heap accesses
-    bool        virtualMode_;   //!< Virtual mode
-    uint64_t    baseAddress_;   //!< Virtual heap base address
-};
-
-class VirtualHeap : public Heap
-{
-public:
-    VirtualHeap(
-        Device& device      //!< GPU device object
-        );
-
-    virtual bool create(
-        size_t  totalSize,  //!< total size of the allocated heap (bytes)
-        bool    remoteAlloc //!< allocate the heap in remote memory
-        );
-
-    //! Heap destructor
-    virtual ~VirtualHeap();
-
-    /*!
-     * \brief Allocates memory from a heap (best-fit).
-     * We round up to 4k granularity for alignment.
-     *
-     * \return A pointer to allocated heap block object.
-     */
-    virtual HeapBlock* alloc(
-        size_t size     //! The allocation size
-        );
-
-    //! Release memory back to a heap.
-    virtual void free(HeapBlock* blk);
-
-    //! Copies this heap to another
-    virtual bool copyTo(Heap* heap);
-
-    virtual bool isSane(void) const;    //!< Checks heap sanity
-};
-
-} // namespace gpu
-
-#endif // GPUHEAP_HPP_
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index 0ffabf5468..8f511311e7 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -824,17 +824,6 @@ Kernel::create(
     // Initialize the kernel parameters
     bool    result = initParameters();
 
-    if (!dev().heap()->isVirtual()) {
-        amd::option::Options *options = nullProg().getCompilerOptions();
-        // @todo Remove this. This is a hack for no VM mode
-        if (!options->oVariables->EnableDumpKernel) {
-            if (!name().compare(BlitName[KernelBlitManager::BlitCopyImageToBuffer]) ||
-                !name().compare(BlitName[KernelBlitManager::BlitCopyBufferToImage])) {
-                blitKernelHack_ = true;
-            }
-        }
-    }
-
     // Wave limiter needs to be initialized after kernel metadata is parsed
     // Since it depends on it.
     waveLimiter_.enable();
@@ -855,7 +844,6 @@ Kernel::Kernel(
     const Program&      prog,
     const InitData*     initData)
     : NullKernel(name, gpuDev, prog)
-    , blitKernelHack_(false)
     , waveLimiter_(this)
 {
     hwPrivateSize_ = 0;
@@ -1603,10 +1591,6 @@ Kernel::debug(VirtualGPU& gpu) const
 {
     std::fstream    stubWrite;
     address         src = NULL;
-    if (!dev().heap()->isVirtual()) {
-        src  = reinterpret_cast<address>
-            (const_cast<Resource&>(dev().globalMem()).map(&gpu));
-    }
 
     std::cerr << "--- " << name_ << " ---" << std::endl;
     for (uint i = 0; i < arguments_.size(); ++i) {
@@ -1689,9 +1673,6 @@ Kernel::debug(VirtualGPU& gpu) const
             stubWrite.close();
         }
     }
-    if (!dev().heap()->isVirtual()) {
-        const_cast<Resource&>(dev().globalMem()).unmap(&gpu);
-    }
 }
 
 bool
@@ -1824,18 +1805,10 @@ Kernel::setArgument(
                     type = ArgumentBuffer;
                 }
                 else {
-                    if (blitKernelHack_) {
-                        // Bind global buffer to UAV this buffer is bound to
-                        if (!bindResource(gpu, *gpuMem, 0, GlobalBuffer, uavRaw_)) {
-                            return false;
-                        }
-                    }
-                    else {
-                        // Bind global buffer to UAV this buffer is bound to
-                        if (!bindResource(gpu, dev().globalMem(), 0,
-                            GlobalBuffer, uavRaw_)) {
-                            return false;
-                        }
+                    // Bind global buffer to UAV this buffer is bound to
+                    if (!bindResource(gpu, dev().globalMem(), 0,
+                        GlobalBuffer, uavRaw_)) {
+                        return false;
                     }
                 }
 
@@ -1848,11 +1821,9 @@ Kernel::setArgument(
 
                 // Update offset only if we bind HeapBuffer or
                 // it's global address space in UAV setup on SI+
-                if (!blitKernelHack_) {
-                    offset += gpuMem->hbOffset();
-                    if (!forceZeroOffset) {
-                        assert((offset != 0) && "Offset 0 with a real allocation!");
-                    }
+                offset += gpuMem->hbOffset();
+                if (!forceZeroOffset) {
+                    assert((offset != 0) && "Offset 0 with a real allocation!");
                 }
                 gpu.addVmMemory(gpuMem);
             }
@@ -2253,10 +2224,9 @@ Kernel::bindResource(
 
     gslMemObject gslMem = NULL;
     // Use global address space on SI+ for UAV setup
-    if (((type == ArgumentBuffer) || (type == ArgumentCbID) ||
-         (type == ArgumentUavID) || (type == ArgumentPrintfID)) &&
-        !blitKernelHack_) {
-        gslMem = dev().heap()->resource().gslResource();
+    if ((type == ArgumentBuffer) || (type == ArgumentCbID) ||
+        (type == ArgumentUavID) || (type == ArgumentPrintfID)) {
+        gslMem = dev().heap().resource().gslResource();
     }
     else {
         gslMem = resource.gslResource();
@@ -2803,7 +2773,7 @@ NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount)
         case KernelArg::PointerPrivate:
             // Check if can't use a dedicated UAV,
             // so realloc memory in the heap
-            arg->memory_.realloc_ = isRealloc();
+            arg->memory_.realloc_ = false;
             arg->memory_.uavBuf_ = true;
             break;
         case KernelArg::PointerHwConst:
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index b46242ec2d..c89b9e1589 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -450,9 +450,6 @@ public:
     uint  instructionCnt() const { return instructionCnt_; }
 
 protected:
-    //! Returns TRUE if memory should be reallocated, returns FALSE always for NullDevice
-    virtual bool isRealloc() const { return false; }
-
     /*! \brief Parses the metadata structure for the kernel,
      *   provided by the OpenCL compiler
      *
@@ -673,9 +670,6 @@ protected:
      */
     bool initConstBuffers();
 
-    //! Returns TRUE if memory should be reallocated, returns FALSE always for NullDevice
-    virtual bool isRealloc() const { return !dev().heap()->isVirtual(); }
-
 private:
     //! Disable copy constructor
     Kernel(const Kernel&);
@@ -771,9 +765,6 @@ private:
     uint    hwPrivateSize_;     //!< initial HW private size
     uint    hwLocalSize_;       //!< initial HW local size
 
-    //! @todo remove the blit kernel hack
-    bool    blitKernelHack_;    //!< No VM hack for kernel blit
-
     WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
 };
 
diff --git a/rocclr/runtime/device/gpu/gpumemory.cpp b/rocclr/runtime/device/gpu/gpumemory.cpp
index 3dacc145fc..0534ffe201 100644
--- a/rocclr/runtime/device/gpu/gpumemory.cpp
+++ b/rocclr/runtime/device/gpu/gpumemory.cpp
@@ -30,39 +30,24 @@ namespace gpu {
 Memory::Memory(
     const Device&   gpuDev,
     amd::Memory&    owner,
-    HeapBlock*      hb,
     size_t          size)
     : device::Memory(owner)
-    , Resource(gpuDev, ((hb) ? hb->size_ : size) / Heap::ElementSize, Heap::ElementType)
-    , hb_(hb)
+    , Resource(gpuDev, size / Device::Heap::ElementSize, Device::Heap::ElementType)
 {
     init();
 
-    if (NULL != hb_) hb_->setMemory(this);
-
     if (owner.parent() != NULL) {
         flags_ |= SubMemoryObject;
     }
 }
 
-Memory::Memory(
-    const Device&   gpuDev,
-    HeapBlock&      hb)
-    : device::Memory(hb.size_)
-    , Resource(gpuDev, hb.size_ / Heap::ElementSize, Heap::ElementType)
-    , hb_(&hb)
-{
-    init();
-    hb.setMemory(this);
-}
-
 Memory::Memory(
     const Device&   gpuDev,
     size_t          size)
     : device::Memory(size)
     , Resource(gpuDev,
-        amd::alignUp(size, Heap::ElementSize) / Heap::ElementSize, Heap::ElementType)
-    , hb_(NULL)
+        amd::alignUp(size, Device::Heap::ElementSize) /
+            Device::Heap::ElementSize, Device::Heap::ElementType)
 {
     init();
 }
@@ -75,7 +60,6 @@ Memory::Memory(
     )
     : device::Memory(owner)
     , Resource(gpuDev, width, format)
-    , hb_(NULL)
 {
     init();
 
@@ -92,7 +76,6 @@ Memory::Memory(
     )
     : device::Memory(size)
     , Resource(gpuDev, width, format)
-    , hb_(NULL)
 {
     init();
 }
@@ -110,7 +93,6 @@ Memory::Memory(
     )
     : device::Memory(owner)
     , Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels)
-    , hb_(NULL)
 {
     init();
 
@@ -132,7 +114,6 @@ Memory::Memory(
     )
     : device::Memory(size)
     , Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels)
-    , hb_(NULL)
 {
     init();
 }
@@ -197,14 +178,9 @@ Memory::create(
             break;
         case Resource::Remote:
         case Resource::RemoteUSWC:
-            // @todo Enable unconditional optimization for remote memory
-            if ((owner() != NULL &&
-                owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) ||
-                (hb() == NULL)) {
-                if (!cal()->tiled_) {
-                    // Marks memory object for direct GPU access to the host memory
-                    flags_ |= HostMemoryDirectAccess;
-                }
+            if (!cal()->tiled_) {
+                // Marks memory object for direct GPU access to the host memory
+                flags_ |= HostMemoryDirectAccess;
             }
             break;
         case Resource::View: {
@@ -481,8 +457,8 @@ Memory::createInterop(InteropType type)
     else {
         // Allocate Resource object for interop as buffer
         interopMemory_ = new Memory(dev(), size(),
-            amd::alignUp(size(), Heap::ElementSize) / Heap::ElementSize,
-            Heap::ElementType);
+            amd::alignUp(size(), Device::Heap::ElementSize) / Device::Heap::ElementSize,
+            Device::Heap::ElementType);
 
         // Create the interop object in CAL
         if (NULL == interopMemory_ || !interopMemory_->create(memType, createParams)) {
@@ -502,14 +478,6 @@ Memory::~Memory()
     // Clean VA cache
     dev().removeVACache(this);
 
-    // Release associated heap block, if any
-    if (hb_) {
-        // Protect heap block from simultaneous release with realloc
-        amd::ScopedLock k(dev().lockAsyncOps());
-        hb_->setMemory(NULL);
-        hb_->free();
-    }
-
     delete interopMemory_;
 
     // Release associated map target, if any
@@ -531,35 +499,6 @@ Memory::~Memory()
     }
 }
 
-bool
-Memory::reallocate(HeapBlock* hb, const Resource* parent)
-{
-    Resource::ViewParams params;
-    params.size_        = hb->size_;
-    params.resource_    = parent;
-    params.memory_      = NULL;
-
-    // Check if it's a view reallocation
-    if (NULL != hb->parent_) {
-        // The offset inside the view is unchanged
-        params.offset_ = Resource::offset();
-
-        // Create a new view
-        if (Resource::create(Resource::View, &params)) {
-            hb_ = hb;
-            return true;
-        }
-    }
-    else {
-        params.offset_ = hb->offset_;
-        if (Resource::reallocate(&params)) {
-            hb_ = hb;
-            return true;
-        }
-    }
-    return false;
-}
-
 void
 Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
 {
@@ -814,33 +753,13 @@ Memory::createBufferView(amd::Memory& subBufferOwner)
 {
     gpu::Memory*            viewMemory;
     Resource::ViewParams    params;
-    HeapBlock*              hb = NULL;
 
     size_t  offset = subBufferOwner.getOrigin();
     size_t  size = subBufferOwner.getSize();
 
-    if (!dev().heap()->isVirtual()) {
-        if (NULL == hb_) {
-            LogError("HeapBlock must be initialized!");
-            return NULL;
-        }
-
-        hb = new HeapBlock(NULL, size, offset + hb_->offset());
-        if (hb == NULL) {
-            LogError("We don't have enough video memory!");
-            return NULL;
-        }
-        amd::ScopedLock lock(owner()->lockMemoryOps());
-        hb_->addView(hb);
-    }
-
     // Create a memory object
-    viewMemory = new gpu::Memory(dev(), subBufferOwner, hb, size);
+    viewMemory = new gpu::Memory(dev(), subBufferOwner, size);
     if (NULL == viewMemory) {
-        if (hb != NULL) {
-            hb->setMemory(NULL);
-            hb->free();
-        }
         return NULL;
     }
 
diff --git a/rocclr/runtime/device/gpu/gpumemory.hpp b/rocclr/runtime/device/gpu/gpumemory.hpp
index c6ccb4b23e..503ca42a34 100644
--- a/rocclr/runtime/device/gpu/gpumemory.hpp
+++ b/rocclr/runtime/device/gpu/gpumemory.hpp
@@ -8,7 +8,6 @@
 #include "top.hpp"
 #include "thread/atomic.hpp"
 #include "device/gpu/gpuresource.hpp"
-#include "device/gpu/gpuheap.hpp"
 #include "device/gpu/gpudevice.hpp"
 #include <map>
 
@@ -27,7 +26,6 @@ class Heap;
 class Resource;
 class Memory;
 class VirtualGPU;
-class HeapBlock;
 
 //! GPU memory object.
 //  Wrapper that can contain a heap block or an interop buffer/image.
@@ -44,14 +42,8 @@ public:
     Memory(
         const Device&   gpuDev,
         amd::Memory&    owner,
-        HeapBlock*      hb,
         size_t          size = 0);
 
-    //! Constructor (nonfat version for local scratch mem use)
-    Memory(
-        const Device&   gpuDev,
-        HeapBlock&      hb);
-
     //! Constructor (nonfat version for local scratch mem use without heap block)
     Memory(
         const Device&   gpuDev,
@@ -102,12 +94,6 @@ public:
     //! Default destructor
     ~Memory();
 
-    //! Reallocates the memory object in the new heap block
-    bool reallocate(
-        HeapBlock*      hb,     //! The new heap block for this memory object
-        const Resource* parent  //! Parent resource for view reallocaiton
-        );
-
     //! Creates the interop memory
     bool createInterop(
         InteropType     type    //!< The interop type
@@ -189,9 +175,6 @@ public:
     //! Sets interop type for this memory object
     void setInteropType(InteropType type) { interopType_ = type; }
 
-    //! Returns the HeapBlock pointer
-    const HeapBlock* hb() const { return hb_; }
-
     //! Set the owner
     void setOwner(amd::Memory* owner) { owner_ = owner; }
 
@@ -229,7 +212,6 @@ private:
     InteropType interopType_;   //!< Interop type
     Memory*     interopMemory_; //!< interop memory
 
-    HeapBlock*  hb_;            //!< Heap Block, or NULL if not in-heap memory
     Memory*     pinnedMemory_;  //!< Memory used as pinned system memory
     const Memory*   parent_;        //!< Parent memory object
 };
diff --git a/rocclr/runtime/device/gpu/gpuresource.cpp b/rocclr/runtime/device/gpu/gpuresource.cpp
index 85feb33c5e..248812b8c1 100644
--- a/rocclr/runtime/device/gpu/gpuresource.cpp
+++ b/rocclr/runtime/device/gpu/gpuresource.cpp
@@ -322,7 +322,7 @@ static uint32_t GetHSAILImageOrderType(gslChannelOrder chOrder, cmSurfFmt format
 }
 
 bool
-Resource::create(MemoryType memType, CreateParams* params, bool heap)
+Resource::create(MemoryType memType, CreateParams* params)
 {
     bool    calRes = false;
     gslMemObject  gslResource = 0;
@@ -382,7 +382,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
     }
 
     // Force remote allocation if it was requested in the settings
-    if (dev().settings().remoteAlloc_ && !heap &&
+    if (dev().settings().remoteAlloc_ &&
         ((memoryType() == Local) ||
          (memoryType() == Persistent))) {
         if (dev().settings().apuSystem_ && dev().settings().viPlus_) {
@@ -515,7 +515,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
                 if (memoryType() == Local) {
                     cal_.type_ = Persistent;
                 }
-                else if (!heap && (memoryType() == Persistent)) {
+                else if (memoryType() == Persistent) {
                     cal_.type_ = RemoteUSWC;
                 }
                 // Remote cacheable to uncacheable
@@ -553,11 +553,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
                 reinterpret_cast<const char*>(address_) - tmpHost);
 
             pinOffset_ = hostMemOffset & 0xff;
-            //!@note GSL has a problem with the defines for flags and
-            //! view creation, so check the restriction here
-            if (!dev().heap()->isVirtual() && (pinOffset_ != 0)) {
-                return false;
-            }
 
             pinAddress = tmpHost;
             // Align width to avoid GSL useless assert with a view
@@ -629,20 +624,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
                 calRes = true;
             }
 
-            // Check if it's a heap allocation
-            if (!dev().heap()->isVirtual()) {
-                if (viewOwner_ == &dev().globalMem()) {
-                    // Allocation directly from the heap
-                    hbOffset_   = static_cast<uint64_t>(view->offset_);
-                }
-                else {
-                    // Allocation from another memory object
-                    hbOffset_   = static_cast<uint64_t>(view->offset_) +
-                        viewOwner_->hbOffset();
-                }
-                hbSize_ = view->size_;
-            }
-
             if (viewOwner_->isMemoryType(Pinned)) {
                 address_ = viewOwner_->data() + offset();
             }
@@ -952,11 +933,9 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
     cal_.tiled_ = (GSL_MOA_TILING_LINEAR != tiling) &&
         (GSL_MOA_TILING_LINEAR_GENERAL != tiling);
 
-    // Get the heap block offset if it's a virtual heap
-    if (dev().heap()->isVirtual()) {
-        hbOffset_ = gslResource->getSurfaceAddress() -
-            dev().heap()->baseAddress();
-    }
+    // Get the heap block offset
+    hbOffset_ = gslResource->getSurfaceAddress() -
+        dev().heap().baseAddress();
     hbSize_ = static_cast<uint64_t>(gslResource->getSurfaceSize());
 
     if (!dev().settings().use64BitPtr_ &&
@@ -1036,32 +1015,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
     return true;
 }
 
-bool
-Resource::reallocate(CreateParams* params)
-{
-    GslResourceReference*   old;
-    GslResourceReference*   active;
-
-    old = gslRef_;
-    if (!create(memoryType(), params)) {
-        gslRef_ = old;
-        return false;
-    }
-    // Get the new active resource
-    active = gslRef_;
-    gslRef_ = old;
-
-    dev().resCopy(old->gslResource(),
-        active->gslResource(), CAL_MEMCOPY_SYNC);
-
-    // Free all old resources
-    assert(renames_.size() == 0);
-    free();
-
-    gslRef_ = active;
-    return true;
-}
-
 void
 Resource::free()
 {
@@ -1813,10 +1766,8 @@ Resource::setActiveRename(VirtualGPU& gpu, GslResourceReference* rename)
     gslRef_  = rename;
     address_ = rename->cpuAddress_;
 
-    if (dev().heap()->isVirtual()) {
-        hbOffset_ = rename->gslResource()->getSurfaceAddress() -
-            dev().heap()->baseAddress();
-    }
+    hbOffset_ = rename->gslResource()->getSurfaceAddress() -
+        dev().heap().baseAddress();
 }
 
 bool
diff --git a/rocclr/runtime/device/gpu/gpuresource.hpp b/rocclr/runtime/device/gpu/gpuresource.hpp
index 6430467760..fe4215327f 100644
--- a/rocclr/runtime/device/gpu/gpuresource.hpp
+++ b/rocclr/runtime/device/gpu/gpuresource.hpp
@@ -209,15 +209,6 @@ public:
      */
     virtual bool create(
         MemoryType  memType,        //!< memory type
-        CreateParams*   params = 0, //!< special parameters for resource allocation
-        bool        heap = false    //!< Global heap allocation for not VM mode
-        );
-
-    /*! \brief Reallocates a CAL object, associated with the resource
-     *
-     *  \return True if we succesfully reallocated a CAL resource
-     */
-    bool reallocate(
         CreateParams*   params = 0  //!< special parameters for resource allocation
         );
 
diff --git a/rocclr/runtime/device/gpu/gpusettings.cpp b/rocclr/runtime/device/gpu/gpusettings.cpp
index 8718ad21e4..3f64bdfe8a 100644
--- a/rocclr/runtime/device/gpu/gpusettings.cpp
+++ b/rocclr/runtime/device/gpu/gpusettings.cpp
@@ -50,10 +50,6 @@ Settings::Settings()
     maxRenames_         = 16;
     maxRenameSize_      = 4 * Mi;
 
-    // The global heap settings
-    heapSize_           = GPU_INITIAL_HEAP_SIZE * Mi;
-    heapSizeGrowth_     = GPU_HEAP_GROWTH_INCREMENT * Mi;
-
     imageSupport_       = false;
     hwLDSSize_          = 0;
 
diff --git a/rocclr/runtime/device/gpu/gpusettings.hpp b/rocclr/runtime/device/gpu/gpusettings.hpp
index 6fe4e974bc..fca4a2c6c4 100644
--- a/rocclr/runtime/device/gpu/gpusettings.hpp
+++ b/rocclr/runtime/device/gpu/gpusettings.hpp
@@ -82,8 +82,6 @@ public:
     size_t  stagedXferSize_;    //!< Staged buffer size
     uint    maxRenames_;        //!< Maximum number of possible renames
     uint    maxRenameSize_;     //!< Maximum size for all renames
-    size_t  heapSize_;          //!< The global heap size
-    size_t  heapSizeGrowth_;    //!< The global heap size growth
     uint    hwLDSSize_;         //!< HW local data store size
     uint    maxWorkGroupSize_;  //!< Requested workgroup size for this device
     uint    hostMemDirectAccess_;   //!< Enables direct access to the host memory
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index e78f63db50..6cc3eae985 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -517,10 +517,6 @@ VirtualGPU::create(
             // Fall through ...
         case Settings::BlitEngineCAL:
         case Settings::BlitEngineKernel:
-            if (!dev().heap()->isVirtual()) {
-                blitSetup.disableReadBufferRect_    = true;
-                blitSetup.disableWriteBufferRect_   = true;
-            }
             // use host blit for HW debug
             if (dev().settings().enableHwDebug_) {
                 blitSetup.disableCopyImageToBuffer_   = true;
@@ -3166,23 +3162,21 @@ VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingE
 bool
 VirtualGPU::addVmMemory(const Resource* resource)
 {
-    if (dev().heap()->isVirtual()) {
-        uint*    cnt = &cal_.memCount_;
-        (*cnt)++;
-        // Reallocate array if kernel uses more memory objects
-        if (numVmMems_ < *cnt) {
-            gslMemObject* tmp;
-            tmp = new gslMemObject [*cnt];
-            if (tmp == NULL) {
-                return false;
-            }
-            memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_);
-            delete [] vmMems_;
-            vmMems_ = tmp;
-            numVmMems_ = *cnt;
+    uint*    cnt = &cal_.memCount_;
+    (*cnt)++;
+    // Reallocate array if kernel uses more memory objects
+    if (numVmMems_ < *cnt) {
+        gslMemObject* tmp;
+        tmp = new gslMemObject [*cnt];
+        if (tmp == NULL) {
+            return false;
         }
-        vmMems_[*cnt - 1] = resource->gslResource();
+        memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_);
+        delete [] vmMems_;
+        vmMems_ = tmp;
+        numVmMems_ = *cnt;
     }
+    vmMems_[*cnt - 1] = resource->gslResource();
 
     return true;
 }
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
index 32a3ed2c80..86ea275ac5 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
@@ -496,7 +496,7 @@ CALGSLDevice::SetupContext(int32 &asic_id)
     getAttribs_int(temp_cs);
     temp_cs->getMemInfo(&m_memInfo, GSL_MEMINFO_BASIC);
 
-    m_vmMode = temp_cs->getVMMode();
+    assert(temp_cs->getVMMode());
 
     m_adp->deleteContext(temp_cs);
 
@@ -1313,38 +1313,6 @@ CALGSLDevice::PerformDMACopy(gslMemObject srcMem, gslMemObject destMem, cmSurfFm
     return true;
 }
 
-void
-CALGSLDevice::resCopy(gslMemObject srcRes, gslMemObject dstRes, uint32 flags) const
-{
-    assert(m_cs != 0);
-    assert(srcRes != 0);
-    assert(dstRes != 0);
-
-    //! @note: GSL device isn't thread safe
-    amd::ScopedLock k(gslDeviceOps());
-
-    uint64 surfaceSize;
-
-    CopyType type = GetCopyType(srcRes, dstRes, 0, 0, m_allowDMA, 0, surfaceSize, 0, 0);
-
-    if (type == USE_DRMDMA)
-    {
-        m_cs->DMACopy(srcRes, 0, dstRes, 0, surfaceSize, GSL_SYNCUPLOAD_SYNC_WAIT, NULL);
-        m_cs->Flush();
-        Wait(m_cs, GSL_DRMDMA_SYNC_ATI, m_mapDMAQuery);
-    }
-    else if (type == USE_CPDMA)
-    {
-        m_cs->syncUploadRaw(srcRes, 0, dstRes, 0, surfaceSize, 0);
-        m_cs->Flush();
-        Wait(m_cs, GSL_SYNC_ATI, m_mapQuery);
-    }
-    else
-    {
-        assert(0 && "No copy engine is being used");
-    }
-}
-
 #define CPDMA_THRESHOLD 131072
 
 CopyType
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.h
index adeecc2006..41eda23143 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.h
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.h
@@ -97,14 +97,10 @@ public:
     const CALdeviceattribs& getAttribs() const { return m_attribs; }
     const gslMemInfo& getMemInfo() const { return m_memInfo; }
 
-    bool             isVmMode() const { return m_vmMode; };
-
     uint32           getVPUMask() const { return m_vpuMask; }
     bool             canDMA() const { return m_canDMA; }
     gslMemObject     m_srcDRMDMAMem, m_dstDRMDMAMem;    // memory object of flush buffer, used for DRMDMA flush
 
-    void             resCopy(gslMemObject srcRes, gslMemObject dstRes, uint32 flags) const;
-
     void             PerformAdapterInitialization() const;
     void             PerformFullInitialization() const;
 
@@ -211,7 +207,6 @@ private:
             uint    m_computeRing           : 1;
             uint    m_usePerVPUAdapterModel : 1;
             uint    m_PerformLazyDeviceInit : 1;
-            uint    m_vmMode                : 1;
             uint    m_isComputeRingIDForced : 1;
         };
     };
diff --git a/rocclr/runtime/device/hsa/hsasettings.cpp b/rocclr/runtime/device/hsa/hsasettings.cpp
index bef9ca26ae..5c5a2c2df1 100644
--- a/rocclr/runtime/device/hsa/hsasettings.cpp
+++ b/rocclr/runtime/device/hsa/hsasettings.cpp
@@ -34,7 +34,6 @@ Settings::Settings()
 bool
 Settings::create(bool doublePrecision)
 {
-    largeHostMemAlloc_ = true;
     customHostAllocator_ = true;
 
     // Enable extensions
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index fb803782d9..3e4db8c522 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -52,12 +52,8 @@ release(cstring, GPU_DEVICE_ORDINAL, "",                                      \
         "Select the device ordinal (comma seperated list of available devices)") \
 release(bool, REMOTE_ALLOC, false,                                            \
         "Use remote memory for the global heap allocation")                   \
-release(int, GPU_INITIAL_HEAP_SIZE, 16,                                       \
-        "Initial size of the GPU heap in MiB")                                \
 release(uint, GPU_MAX_HEAP_SIZE, 100,                                         \
         "Set maximum size of the GPU heap to % of board memory")              \
-release(int, GPU_HEAP_GROWTH_INCREMENT, 8,                                    \
-        "Amount to grow the GPU heap by in MiB")                              \
 release(uint, GPU_STAGING_BUFFER_SIZE, 512,                                   \
         "Size of the GPU staging buffer in KiB")                              \
 release(bool, GPU_DUMP_BLIT_KERNELS, false,                                   \