From 117be0b55a6e2bb3fa5a71dce5e96514cf1beeba Mon Sep 17 00:00:00 2001
From: Sean Keely <Sean.Keely@amd.com>
Date: Mon, 11 Sep 2017 20:31:20 -0500
Subject: [PATCH] Add suballocator for ordinary VRAM allocations smaller than
 2MB.

Track pointer info for sub 2MB fragment allocations in allocation_map_.

Add fragment support to IPC.

Change-Id: I00cfc2e2fa289aac90a4718c392f9bb056a61a87
---
 .../hsa-runtime/core/inc/amd_memory_region.h  |  15 ++
 runtime/hsa-runtime/core/inc/memory_region.h  |   1 +
 runtime/hsa-runtime/core/inc/runtime.h        |  13 +-
 .../core/runtime/amd_memory_region.cpp        |  33 ++-
 runtime/hsa-runtime/core/runtime/runtime.cpp  | 175 ++++++++++---
 runtime/hsa-runtime/core/util/locks.h         |   9 +-
 runtime/hsa-runtime/core/util/simple_heap.h   | 247 ++++++++++++++++++
 runtime/hsa-runtime/inc/hsa_ext_amd.h         |   2 +-
 8 files changed, 448 insertions(+), 47 deletions(-)
 create mode 100644 runtime/hsa-runtime/core/util/simple_heap.h
diff --git a/runtime/hsa-runtime/core/inc/amd_memory_region.h b/runtime/hsa-runtime/core/inc/amd_memory_region.h
index d2321dfd4e..f411c05c2f 100644
--- a/runtime/hsa-runtime/core/inc/amd_memory_region.h
+++ b/runtime/hsa-runtime/core/inc/amd_memory_region.h
@@ -49,6 +49,7 @@
 
 #include "core/inc/agent.h"
 #include "core/inc/memory_region.h"
+#include "core/util/simple_heap.h"
 
 #include "inc/hsa_ext_amd.h"
 
@@ -181,7 +182,21 @@ class MemoryRegion : public core::MemoryRegion {
   HSAuint64 virtual_size_;
 
   static const size_t kPageSize_ = 4096;
+
+  class BlockAllocator {  // Block source handed to SimpleHeap (see fragment_allocator_).
+   private:
+    MemoryRegion& region_;  // Region that blocks are allocated from and freed to.
+    static const size_t block_size_ = 2 * 1024 * 1024;  // 2MB blocks.
+   public:
+    explicit BlockAllocator(MemoryRegion& region) : region_(region) {}
+    void* alloc(size_t request_size, size_t& allocated_size) const;  // Defined out of line.
+    void free(void* ptr, size_t length) const { region_.Free(ptr, length); }  // Forwards to Free.
+    size_t block_size() const { return block_size_; }  // Size of every block, in bytes.
+  };
+
+  mutable SimpleHeap<BlockAllocator> fragment_allocator_;
 };
+
 }  // namespace
 
 #endif  // header guard
diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h
index 502ebb38d4..bea4250086 100644
--- a/runtime/hsa-runtime/core/inc/memory_region.h
+++ b/runtime/hsa-runtime/core/inc/memory_region.h
@@ -85,6 +85,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
     AllocateRestrict = (1 << 0),    // Don't map system memory to GPU agents
     AllocateExecutable = (1 << 1),  // Set executable permission
     AllocateDoubleMap = (1 << 2),   // Map twice VA allocation to backing store
+    AllocateDirect = (1 << 3),      // Bypass fragment cache.
   };
 
   typedef uint32_t AllocateFlags;
diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h
index 06eff531ab..1a18149db3 100644
--- a/runtime/hsa-runtime/core/inc/runtime.h
+++ b/runtime/hsa-runtime/core/inc/runtime.h
@@ -264,8 +264,14 @@ class Runtime {
 
   hsa_status_t InteropUnmap(void* ptr);
 
+  struct PtrInfoBlockData {
+    void* base;
+    size_t length;
+  };
+
   hsa_status_t PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t),
-                       uint32_t* num_agents_accessible, hsa_agent_t** accessible);
+                       uint32_t* num_agents_accessible, hsa_agent_t** accessible,
+                       PtrInfoBlockData* block_info = nullptr);
 
   hsa_status_t SetPtrInfoData(void* ptr, void* userptr);
 
@@ -315,12 +321,13 @@ class Runtime {
   static void AsyncEventsLoop(void*);
 
   struct AllocationRegion {
-    AllocationRegion() : region(NULL), size(0) {}
+    AllocationRegion() : region(NULL), size(0), user_ptr(nullptr) {}
     AllocationRegion(const MemoryRegion* region_arg, size_t size_arg)
-        : region(region_arg), size(size_arg) {}
+        : region(region_arg), size(size_arg), user_ptr(nullptr) {}
 
     const MemoryRegion* region;
     size_t size;
+    void* user_ptr;
   };
 
   struct AsyncEventsControl {
diff --git a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
index 00642f3de0..1e44e14236 100644
--- a/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
@@ -49,6 +49,7 @@
 #include "core/inc/amd_cpu_agent.h"
 #include "core/inc/amd_gpu_agent.h"
 #include "core/util/utils.h"
+#include "core/inc/exceptions.h"
 
 namespace amd {
 void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag,
@@ -98,13 +99,13 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) {
   hsaKmtUnmapMemoryToGPU(const_cast<void*>(ptr));
 }
 
-MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile,
-                           core::Agent* owner,
+MemoryRegion::MemoryRegion(bool fine_grain, bool full_profile, core::Agent* owner,
                            const HsaMemoryProperties& mem_props)
     : core::MemoryRegion(fine_grain, full_profile, owner),
       mem_props_(mem_props),
       max_single_alloc_size_(0),
-      virtual_size_(0) {
+      virtual_size_(0),
+      fragment_allocator_(BlockAllocator(*this)) {
   virtual_size_ = GetPhysicalSize();
 
   mem_flag_.Value = 0;
@@ -169,6 +170,15 @@ hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags,
   kmt_alloc_flags.ui32.AQLQueueMemory =
       (alloc_flags & AllocateDoubleMap ? 1 : 0);
 
+  // Only allow using the suballocator for ordinary VRAM.
+  bool useSubAlloc = IsLocalMemory();
+  useSubAlloc &= (alloc_flags == AllocateRestrict);  // Any other flag combination opts out.
+  useSubAlloc &= (size <= fragment_allocator_.max_alloc());  // Must fit in one block.
+  if (useSubAlloc) {
+    *address = fragment_allocator_.alloc(size);  // Failure is reported by exception.
+    return HSA_STATUS_SUCCESS;
+  }
+
   *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size);
 
   if (*address != NULL) {
@@ -220,6 +230,8 @@ hsa_status_t MemoryRegion::Allocate(size_t size, AllocateFlags alloc_flags,
 }
 
 hsa_status_t MemoryRegion::Free(void* address, size_t size) const {
+  if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS;
+
   MakeKfdMemoryUnresident(address);
 
   FreeKfdMemory(address, size);
@@ -586,4 +598,19 @@ hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size,
   return HSA_STATUS_SUCCESS;
 }
 
+// Allocates one whole block from the owning region on behalf of the fragment allocator, bypassing
+// the fragment cache (AllocateDirect).  Throws ::AMD::hsa_exception when the region call fails.
+void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const {
+  assert(request_size <= block_size() && "BlockAllocator alloc request exceeds block size.");
+  void* ret = nullptr;
+  hsa_status_t err = region_.Allocate(
+      block_size(), core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect,
+      &ret);
+  if (err != HSA_STATUS_SUCCESS)  // Throw by value so handlers catching by reference match.
+    throw ::AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed.");
+  assert(ret != nullptr && "Region returned nullptr on success.");
+  allocated_size = block_size();  // Always a full block, regardless of request_size.
+  return ret;
+}
+
 }  // namespace
diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp
index 1913e179e7..9f3bebca37 100644
--- a/runtime/hsa-runtime/core/runtime/runtime.cpp
+++ b/runtime/hsa-runtime/core/runtime/runtime.cpp
@@ -321,11 +321,11 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size,
 }
 
 hsa_status_t Runtime::FreeMemory(void* ptr) {
-  if (ptr == NULL) {
+  if (ptr == nullptr) {
     return HSA_STATUS_SUCCESS;
   }
 
-  const MemoryRegion* region = NULL;
+  const MemoryRegion* region = nullptr;
   size_t size = 0;
   ScopedAcquire<KernelMutex> lock(&memory_lock_);
 
@@ -333,11 +333,14 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
 
   if (it == allocation_map_.end()) {
     assert(false && "Can't find address in allocation map");
-    return HSA_STATUS_ERROR;
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   }
   region = it->second.region;
   size = it->second.size;
 
+  // Imported fragments can't be released with FreeMemory.
+  if (region == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
   allocation_map_.erase(it);
 
   return region->Free(ptr, size);
@@ -681,7 +684,17 @@ hsa_status_t Runtime::InteropUnmap(void* ptr) {
 }
 
 hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t),
-                              uint32_t* num_agents_accessible, hsa_agent_t** accessible) {
+                              uint32_t* num_agents_accessible, hsa_agent_t** accessible,
+                              PtrInfoBlockData* block_info) {
+  static_assert(static_cast<int>(HSA_POINTER_UNKNOWN) == static_cast<int>(HSA_EXT_POINTER_TYPE_UNKNOWN),
+                "Thunk pointer info mismatch");
+  static_assert(static_cast<int>(HSA_POINTER_ALLOCATED) == static_cast<int>(HSA_EXT_POINTER_TYPE_HSA),
+                "Thunk pointer info mismatch");
+  static_assert(static_cast<int>(HSA_POINTER_REGISTERED_USER) == static_cast<int>(HSA_EXT_POINTER_TYPE_LOCKED),
+                "Thunk pointer info mismatch");
+  static_assert(static_cast<int>(HSA_POINTER_REGISTERED_GRAPHICS) == static_cast<int>(HSA_EXT_POINTER_TYPE_GRAPHICS),
+                "Thunk pointer info mismatch");
+
   HsaPointerInfo thunkInfo;
   uint32_t* mappedNodes;
 
@@ -692,36 +705,50 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
 
   bool returnListData =
       ((alloc != nullptr) && (num_agents_accessible != nullptr) && (accessible != nullptr));
-  if (returnListData) {
-    size_t max_agents = cpu_agents_.size() + gpu_agents_.size();
-    mappedNodes = (uint32_t*)alloca(max_agents * sizeof(uint32_t));
-    // memory_lock protects access to the NMappedNodes array since this changes with calls to memory
-    // APIs.
+
+  {  // memory_lock protects access to the NMappedNodes array and fragment user data since these may
+     // change with calls to memory APIs.
     ScopedAcquire<KernelMutex> lock(&memory_lock_);
     hsaKmtQueryPointerInfo(ptr, &thunkInfo);
-    assert(thunkInfo.NMappedNodes <= max_agents &&
-           "PointerInfo: Thunk returned more than all agents in NMappedNodes.");
-    memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t));
-  } else {
-    hsaKmtQueryPointerInfo(ptr, &thunkInfo);
-  }
-
-  static_assert((int)HSA_POINTER_UNKNOWN == (int)HSA_EXT_POINTER_TYPE_UNKNOWN,
-                "Thunk pointer info mismatch");
-  static_assert((int)HSA_POINTER_ALLOCATED == (int)HSA_EXT_POINTER_TYPE_HSA,
-                "Thunk pointer info mismatch");
-  static_assert((int)HSA_POINTER_REGISTERED_USER == (int)HSA_EXT_POINTER_TYPE_LOCKED,
-                "Thunk pointer info mismatch");
-  static_assert((int)HSA_POINTER_REGISTERED_GRAPHICS == (int)HSA_EXT_POINTER_TYPE_GRAPHICS,
-                "Thunk pointer info mismatch");
+    if (returnListData) {
+      assert(thunkInfo.NMappedNodes <= agents_by_node_.size() &&
+             "PointerInfo: Thunk returned more than all agents in NMappedNodes.");
+      mappedNodes = (uint32_t*)alloca(thunkInfo.NMappedNodes * sizeof(uint32_t));
+      memcpy(mappedNodes, thunkInfo.MappedNodes, thunkInfo.NMappedNodes * sizeof(uint32_t));
+    }
+    retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
+    retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
+    retInfo.hostBaseAddress = thunkInfo.CPUAddress;
+    retInfo.sizeInBytes = thunkInfo.SizeInBytes;
+    retInfo.userData = thunkInfo.UserData;
+    if (block_info != nullptr) {
+      block_info->base = retInfo.hostBaseAddress;
+      block_info->length = retInfo.sizeInBytes;
+    }
+    if (retInfo.type == HSA_EXT_POINTER_TYPE_HSA) {
+      auto fragment = allocation_map_.upper_bound(ptr);
+      if (fragment != allocation_map_.begin()) {
+        fragment--;
+        if ((fragment->first <= ptr) &&
+            (ptr < reinterpret_cast<const uint8_t*>(fragment->first) + fragment->second.size)) {
+          retInfo.hostBaseAddress = const_cast<void*>(fragment->first);
+          retInfo.agentBaseAddress = retInfo.hostBaseAddress;
+          retInfo.sizeInBytes = fragment->second.size;
+          retInfo.userData = fragment->second.user_ptr;
+        }
+      }
+    }
+  }  // end lock scope
 
   retInfo.size = Min(info->size, sizeof(hsa_amd_pointer_info_t));
-  retInfo.type = (hsa_amd_pointer_type_t)thunkInfo.Type;
-  retInfo.agentBaseAddress = reinterpret_cast<void*>(thunkInfo.GPUAddress);
-  retInfo.hostBaseAddress = thunkInfo.CPUAddress;
-  retInfo.sizeInBytes = thunkInfo.SizeInBytes;
-  retInfo.userData = thunkInfo.UserData;
-  retInfo.agentOwner = agents_by_node_[thunkInfo.Node][0]->public_handle();
+
+  // Temp: workaround thunk bug, IPC memory has garbage in Node.  Use the lookup result directly
+  // (no second map lookup) and report a null owner for unknown nodes or empty agent lists.
+  auto it = agents_by_node_.find(thunkInfo.Node);
+  if ((it != agents_by_node_.end()) && !it->second.empty())
+    retInfo.agentOwner = it->second[0]->public_handle();
+  else
+    retInfo.agentOwner.handle = 0;
 
   memcpy(info, &retInfo, retInfo.size);
 
@@ -751,19 +778,50 @@ hsa_status_t Runtime::PtrInfo(void* ptr, hsa_amd_pointer_info_t* info, void* (*a
 }
 
 hsa_status_t Runtime::SetPtrInfoData(void* ptr, void* userptr) {
+  {  // Use allocation map if possible to handle fragments.
+    ScopedAcquire<KernelMutex> lock(&memory_lock_);
+    const auto& it = allocation_map_.find(ptr);
+    if (it != allocation_map_.end()) {
+      it->second.user_ptr = userptr;
+      return HSA_STATUS_SUCCESS;
+    }
+  }
+  // Cover entries not in the allocation map (graphics, lock,...)
   if (hsaKmtSetMemoryUserData(ptr, userptr) == HSAKMT_STATUS_SUCCESS)
     return HSA_STATUS_SUCCESS;
-  else
-    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  return HSA_STATUS_ERROR_INVALID_ARGUMENT;
 }
 
 hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* handle) {
   static_assert(sizeof(hsa_amd_ipc_memory_t) == sizeof(HsaSharedMemoryHandle),
                 "Thunk IPC mismatch.");
-  if (hsaKmtShareMemory(ptr, len, (HsaSharedMemoryHandle*)handle) == HSAKMT_STATUS_SUCCESS)
-    return HSA_STATUS_SUCCESS;
-  else
+  // Reject sharing allocations larger than ~8TB due to thunk limitations.
+  if (len > 0x7FFFFFFF000ull) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  // Check for fragment sharing.
+  PtrInfoBlockData block;
+  hsa_amd_pointer_info_t info;
+  info.size = sizeof(info);
+  if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  if ((block.base != ptr) || (block.length != len)) {
+    if (!IsMultipleOf(block.base, 2 * 1024 * 1024)) {
+      assert(false && "Fragment's block not aligned to 2MB!");
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    const size_t offset =
+        (reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<uint8_t*>(block.base)) / 4096;
+    if (offset > 0x1FF) return HSA_STATUS_ERROR_INVALID_ARGUMENT;  // IPCAttach decodes 9 bits.
+    if (hsaKmtShareMemory(block.base, block.length, reinterpret_cast<HsaSharedMemoryHandle*>(
+                                                        handle)) != HSAKMT_STATUS_SUCCESS)
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    // Holds size in (4K?) pages in thunk handle: Mark as a fragment and denote offset.
+    handle->handle[6] |= 0x80000000 | static_cast<uint32_t>(offset);
+  } else if (hsaKmtShareMemory(ptr, len, reinterpret_cast<HsaSharedMemoryHandle*>(handle)) !=
+             HSAKMT_STATUS_SUCCESS) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  return HSA_STATUS_SUCCESS;
 }
 
 hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, uint32_t num_agents,
@@ -772,14 +830,36 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
   void* importAddress;
   HSAuint64 importSize;
   HSAuint64 altAddress;
+
+  hsa_amd_ipc_memory_t importHandle;
+  importHandle = *handle;  // Local copy so the fragment bits can be stripped before import.
+
+  // Extract fragment info
+  bool isFragment = false;
+  uint32_t fragOffset = 0;  // Byte offset of the fragment within the imported block.
+  auto fixFragment = [&]() {  // Narrows the import result to the fragment and records it.
+    if (!isFragment) return;
+    importAddress = reinterpret_cast<uint8_t*>(importAddress) + fragOffset;
+    len = Min(len, importSize - fragOffset);
+    ScopedAcquire<KernelMutex> lock(&memory_lock_);
+    allocation_map_[importAddress] = AllocationRegion(nullptr, len);  // Null region: import.
+  };
+
+  if ((importHandle.handle[6] & 0x80000000) != 0) {  // Fragment marker set by IPCCreate.
+    isFragment = true;
+    fragOffset = (importHandle.handle[6] & 0x1FF) * 4096;  // Low 9 bits: offset in 4K pages.
+    importHandle.handle[6] &= ~(0x80000000 | 0x1FF);  // Strip marker and offset before import.
+  }
+
   if (num_agents == 0) {
-    if (hsaKmtRegisterSharedHandle(reinterpret_cast<const HsaSharedMemoryHandle*>(handle),
+    if (hsaKmtRegisterSharedHandle(reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle),
                                    &importAddress, &importSize) != HSAKMT_STATUS_SUCCESS)
       return HSA_STATUS_ERROR_INVALID_ARGUMENT;
     if (hsaKmtMapMemoryToGPU(importAddress, importSize, &altAddress) != HSAKMT_STATUS_SUCCESS) {
       hsaKmtDeregisterMemory(importAddress);
       return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
     }
+    fixFragment();
     *mapped_ptr = importAddress;
     return HSA_STATUS_SUCCESS;
   }
@@ -798,9 +878,9 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
   for (int i = 0; i < num_agents; i++)
     agents[i]->GetInfo((hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, &nodes[i]);
 
-  if (hsaKmtRegisterSharedHandleToNodes(reinterpret_cast<const HsaSharedMemoryHandle*>(handle),
-                                        &importAddress, &importSize, num_agents,
-                                        nodes) != HSAKMT_STATUS_SUCCESS)
+  if (hsaKmtRegisterSharedHandleToNodes(
+          reinterpret_cast<const HsaSharedMemoryHandle*>(&importHandle), &importAddress,
+          &importSize, num_agents, nodes) != HSAKMT_STATUS_SUCCESS)
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
 
   HsaMemMapFlags map_flags;
@@ -816,11 +896,28 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len,
     }
   }
 
+  fixFragment();
   *mapped_ptr = importAddress;
   return HSA_STATUS_SUCCESS;
 }
 
 hsa_status_t Runtime::IPCDetach(void* ptr) {
+  {  // Handle imported fragments.
+    ScopedAcquire<KernelMutex> lock(&memory_lock_);
+    const auto& it = allocation_map_.find(ptr);
+    if (it != allocation_map_.end()) {
+      if (it->second.region != nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;  // Not imported.
+      allocation_map_.erase(it);
+      lock.Release();  // PtrInfo acquires memory_lock_ itself, so it must not be held here.
+
+      PtrInfoBlockData block;
+      hsa_amd_pointer_info_t info;
+      info.size = sizeof(info);
+      if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS)
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+      ptr = block.base;  // Unmap and deregister below operate on the enclosing block's base.
+    }
+  }
   if (hsaKmtUnmapMemoryToGPU(ptr) != HSAKMT_STATUS_SUCCESS)
     return HSA_STATUS_ERROR_INVALID_ARGUMENT;
   if (hsaKmtDeregisterMemory(ptr) != HSAKMT_STATUS_SUCCESS)
diff --git a/runtime/hsa-runtime/core/util/locks.h b/runtime/hsa-runtime/core/util/locks.h
index bab52f91e0..c9ff9ee7b2 100644
--- a/runtime/hsa-runtime/core/util/locks.h
+++ b/runtime/hsa-runtime/core/util/locks.h
@@ -59,7 +59,14 @@ class ScopedAcquire {
   explicit ScopedAcquire(LockType* lock) : lock_(lock) { lock_->Acquire(); }
 
   /// @brief: when destructing, release the lock.
-  ~ScopedAcquire() { lock_->Release(); }
+  ~ScopedAcquire() { Release(); }
+
+  /// @brief: release the lock ahead of scope exit.  Idempotent: once released, neither a
+  /// repeated call nor the destructor touches the lock again.
+  void Release() {
+    if (lock_ != nullptr) lock_->Release();
+    lock_ = nullptr;
+  }
 
  private:
   LockType* lock_;
diff --git a/runtime/hsa-runtime/core/util/simple_heap.h b/runtime/hsa-runtime/core/util/simple_heap.h
new file mode 100644
index 0000000000..76df7ad73d
--- /dev/null
+++ b/runtime/hsa-runtime/core/util/simple_heap.h
@@ -0,0 +1,247 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// A simple best fit memory allocator with eager compaction.  Manages block sub-allocation.
+// For use when memory efficiency is more important than allocation speed.
+// O(log n) time.
+
+#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
+
+#include <map>
+#include <deque>
+#include <utility>
+
+#include "core/util/utils.h"
+
+template <typename Allocator> class SimpleHeap {
+ private:
+  struct Fragment_T {
+    typedef std::multimap<size_t, uintptr_t>::iterator ptr_t;
+    ptr_t free_list_entry_;  // Entry in free_list_, or free_list_.end() while in use.
+    size_t size;
+
+    Fragment_T(ptr_t Iterator, size_t Len) : free_list_entry_(Iterator), size(Len) {}
+    Fragment_T() = default;
+  };
+
+  struct Block {
+    uintptr_t base_ptr_;
+    size_t length_;
+
+    Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {}
+    Block() = default;
+  };
+
+  Allocator block_allocator_;
+
+  std::multimap<size_t, uintptr_t> free_list_;  // Free fragments: size -> address.
+  std::map<uintptr_t, std::map<uintptr_t, Fragment_T>> block_list_;  // Block base -> fragments.
+  std::deque<Block> block_cache_;  // Wholly free blocks kept for reuse.
+
+  size_t in_use_size_;  // Bytes of blocks currently tracked in block_list_.
+  size_t cache_size_;   // Bytes of blocks currently held in block_cache_.
+
+  __forceinline bool isFree(const Fragment_T& node) {
+    return node.free_list_entry_ != free_list_.end();
+  }
+  __forceinline void setUsed(Fragment_T& node) { node.free_list_entry_ = free_list_.end(); }
+  __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
+    node.free_list_entry_ = Iterator;
+  }
+  __forceinline Fragment_T makeFragment(size_t Len) { return Fragment_T(free_list_.end(), Len); }
+  __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
+    return Fragment_T(Iterator, Len);
+  }
+
+ public:
+  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
+      : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {}
+  ~SimpleHeap() {
+    trim();
+    // Leak here may be due to the user.  Check is for debugging only.
+    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
+  }
+
+  SimpleHeap(const SimpleHeap& rhs) = delete;
+  SimpleHeap(SimpleHeap&& rhs) = delete;
+  SimpleHeap& operator=(const SimpleHeap& rhs) = delete;
+  SimpleHeap& operator=(SimpleHeap&& rhs) = delete;
+
+  // Returns a |bytes| sized sub-allocation (best fit, then a cached block, then a new block).
+  // Throws std::bad_alloc if bytes is 0 or > max_alloc(); block allocator exceptions propagate.
+  void* alloc(size_t bytes) {
+    if ((bytes == 0) || (bytes > max_alloc())) {  // Zero would alias the free remainder's key.
+      assert(false && "Requested allocation is zero sized or larger than block size.");
+      throw std::bad_alloc();
+    }
+
+    // Find best fit.
+    auto free_fragment = free_list_.lower_bound(bytes);
+    uintptr_t base;
+    size_t size;
+
+    if (free_fragment != free_list_.end()) {
+      base = free_fragment->second;
+      size = free_fragment->first;
+      free_list_.erase(free_fragment);
+
+      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");
+
+      // Find the containing block and fragment
+      auto it = block_list_.upper_bound(base);
+      it--;
+      auto& frag_map = it->second;
+      const auto& fragment = frag_map.find(base);
+
+      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
+      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");
+
+      // Sub-allocate from fragment.
+      fragment->second.size = bytes;
+      setUsed(fragment->second);
+      // Record remaining free space.
+      if (size > bytes) {
+        free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+        frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
+      }
+      return reinterpret_cast<void*>(base);
+    }
+
+    // No usable fragment, check block cache
+    if (!block_cache_.empty()) {
+      const auto& block = block_cache_.back();
+      base = block.base_ptr_;
+      size = block.length_;
+      block_cache_.pop_back();
+      cache_size_ -= size;
+    } else {  // Alloc new block
+      void* ptr = block_allocator_.alloc(bytes, size);
+      base = reinterpret_cast<uintptr_t>(ptr);
+      assert(ptr != nullptr && "Block allocation failed, Allocator is expected to throw.");
+    }
+
+    in_use_size_ += size;
+    assert(size >= bytes && "Alloc exceeds block size.");
+    // Sub alloc and insert free region.
+    if (size > bytes) {
+      free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
+      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
+    }
+    // Track used region
+    block_list_[base][base] = makeFragment(bytes);
+
+    return reinterpret_cast<void*>(base);
+  }
+
+  // Frees |ptr| and merges it with adjacent free fragments.  Returns true on success or when ptr
+  // is nullptr, and false when ptr is not the start of an in-use fragment owned by this heap.
+  bool free(void* ptr) {
+    if (ptr == nullptr) return true;
+    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);
+
+    // Find fragment and validate.
+    auto frag_map_it = block_list_.upper_bound(base);
+    if (frag_map_it == block_list_.begin()) return false;
+    frag_map_it--;
+    auto& frag_map = frag_map_it->second;
+    auto fragment = frag_map.find(base);
+    if (fragment == frag_map.end() || isFree(fragment->second)) return false;
+
+    // Merge lower
+    if (fragment != frag_map.begin()) {
+      auto lower = fragment;
+      lower--;
+      if (isFree(lower->second)) {
+        free_list_.erase(lower->second.free_list_entry_);
+        lower->second.size += fragment->second.size;
+        frag_map.erase(fragment);
+        fragment = lower;
+      }
+    }
+
+    // Merge upper
+    auto upper = fragment;
+    upper++;
+    if ((upper != frag_map.end()) && isFree(upper->second)) {
+      free_list_.erase(upper->second.free_list_entry_);
+      fragment->second.size += upper->second.size;
+      frag_map.erase(upper);
+    }
+
+    // Move whole free blocks to block cache
+    if (frag_map.size() == 1) {
+      in_use_size_ -= fragment->second.size;
+      cache_size_ += fragment->second.size;
+      block_cache_.push_back(Block(fragment->first, fragment->second.size));
+      block_list_.erase(frag_map_it);
+
+      // Release old blocks when over cache limit.
+      while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) {
+        const auto& block = block_cache_.front();
+        block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
+        cache_size_ -= block.length_;
+        block_cache_.pop_front();
+      }
+
+      // Don't publish free space since block was moved to the cache.
+      return true;
+    }
+
+    // Report free fragment
+    auto freeEntry = free_list_.insert(std::make_pair(fragment->second.size, fragment->first));
+    setFree(fragment->second, freeEntry);
+    return true;
+  }
+
+  // Returns every cached block to the block allocator and resets the cache accounting.
+  void trim() {
+    for (const auto& block : block_cache_)
+      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
+    block_cache_.clear();
+    cache_size_ = 0;
+  }
+
+  size_t max_alloc() const { return block_allocator_.block_size(); }
+};
+
+#endif  // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h
index d4a1841240..905aa31851 100755
--- a/runtime/hsa-runtime/inc/hsa_ext_amd.h
+++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h
@@ -1488,7 +1488,7 @@ typedef struct hsa_amd_ipc_memory_s {
  * any process.  In general applications should confirm that a shared memory
  * region has been attached (via hsa_amd_ipc_memory_attach) in the remote
  * process prior to releasing that memory in the local process.
- * Repeated calls for the same allocaiton may, but are not required to, return
+ * Repeated calls for the same allocation may, but are not required to, return
  * unique handles.
  *
  * @param[in] ptr Pointer to memory allocated via ROCr APIs to prepare for