rocm-systems/util/simple_heap.h

////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
//                 AMD Research and AMD HSA Software Development
//
//                 Advanced Micro Devices, Inc.
//
//                 www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//  - Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimers.
//  - Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimers in
//    the documentation and/or other materials provided with the distribution.
//  - Neither the names of Advanced Micro Devices, Inc,
//    nor the names of its contributors may be used to endorse or promote
//    products derived from this Software without specific prior written
//    permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

// A simple best fit memory allocator with eager compaction.  Manages block sub-allocation.
// For use when memory efficiency is more important than allocation speed.
// O(log n) time.

#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_
#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_

#include <map>
#include <deque>
#include <utility>

#include "core/util/utils.h"

namespace wsl {

// Best-fit sub-allocator: carves fragments out of larger blocks obtained from Allocator and
// eagerly merges adjacent free fragments when memory is returned.
//
// Allocator is used through exactly these calls:
//   block_allocator_.alloc(bytes, size)  - returns the block pointer; `size` is read afterwards
//                                          as the actual block length (so it is expected to be
//                                          an output argument and to be >= bytes).
//   block_allocator_.free(ptr, length)   - releases a whole block.
//   block_allocator_.block_size()        - default block size (callable on a const Allocator).
//
// The class does no locking of its own.
template <typename Allocator> class SimpleHeap {
 private:
  // Bookkeeping record for one contiguous fragment of a block.
  struct Fragment_T {
    typedef std::multimap<size_t, uintptr_t>::iterator ptr_t;
    // Position of this fragment in free_list_, or free_list_.end() when it is not listed
    // (fragment is in use, or belongs to a discarded block).
    ptr_t free_list_entry_;
    struct {
      size_t size : 62;  // Fragment length in bytes.
      bool discard : 1;  // Set when the owning block was marked by discardBlock().
      bool free : 1;     // Set when the fragment is not handed out to a user.
    };

    Fragment_T(ptr_t Iterator, size_t Len, bool Free)
        : free_list_entry_(Iterator), size(Len), discard(false), free(Free) {}
    Fragment_T() = default;
  };

  // A whole block (base address and length) as held in the block cache.
  struct Block {
    uintptr_t base_ptr_;
    size_t length_;

    Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {}
    Block() = default;
  };

  Allocator block_allocator_;

  // Free fragments available for sub-allocation, keyed by size -> fragment address.
  std::multimap<size_t, uintptr_t> free_list_;
  // Blocks that currently hold fragments: block base -> (fragment address -> fragment record).
  std::map<uintptr_t, std::map<uintptr_t, Fragment_T>> block_list_;
  // Completely free blocks retained for reuse.  Pushed/popped at the back, released from the
  // front by balance().
  std::deque<Block> block_cache_;

  // Size of blocks that are at least partially in use.
  size_t in_use_size_;
  // Total size of block cache
  size_t cache_size_;

  __forceinline bool isFree(const Fragment_T& node) { return node.free; }
  // Marks node as handed out; a used fragment never has a free list entry.
  __forceinline void setUsed(Fragment_T& node) {
    node.free = false;
    node.free_list_entry_ = free_list_.end();
  }
  // Marks node as free.  Iterator is its free_list_ entry, or free_list_.end() if the fragment
  // must not be offered for sub-allocation.
  __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) {
    node.free_list_entry_ = Iterator;
    node.free = true;
  }
  // Builds the record of a used fragment of Len bytes.
  __forceinline Fragment_T makeFragment(size_t Len) {
    return Fragment_T(free_list_.end(), Len, false);
  }
  // Builds the record of a free fragment of Len bytes whose free list entry is Iterator.
  __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) {
    return Fragment_T(Iterator, Len, true);
  }
  // Removes node's free list entry, if it has one.
  __forceinline void removeFreeListEntry(Fragment_T& node) {
    if (node.free_list_entry_ != free_list_.end()) {
      free_list_.erase(node.free_list_entry_);
      node.free_list_entry_ = free_list_.end();
    }
  }
  // Flags node as belonging to a discarded block and withdraws it from the free list.
  __forceinline void discard(Fragment_T& node) {
    removeFreeListEntry(node);
    node.discard = true;
  }

 public:
  explicit SimpleHeap(const Allocator& BlockAllocator = Allocator())
      : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {}

  // Releases the block cache.  Blocks that still hold fragments are not released here.
  ~SimpleHeap() {
    trim();
    // Leak here may be due to the user.  Check is for debugging only.
    // assert(in_use_size_ == 0 && "Leak in SimpleHeap.");
  }

  SimpleHeap(const SimpleHeap& rhs) = delete;
  SimpleHeap(SimpleHeap&& rhs) = delete;
  SimpleHeap& operator=(const SimpleHeap& rhs) = delete;
  SimpleHeap& operator=(SimpleHeap&& rhs) = delete;

  // Allocates `bytes` bytes and returns the address of the new fragment.
  //
  // Order of attempts:
  //   1. Best-fit free fragment.  The returned address is aligned to GPU_HUGE_PAGE_SIZE for
  //      requests of at least that size and to DEFAULT_GPU_PAGE_SIZE otherwise.
  //   2. A cached block (only for requests smaller than the default block size).
  //   3. A new block from the block allocator.
  // Requests larger than the default block size get a block of their own, which is discarded
  // immediately so that nothing else is sub-allocated from it.
  void* alloc(size_t bytes) {
    // Find best fit.
    uintptr_t base;
    size_t size;
    // For bytes >= 2MB, the requested mem should be aligned
    size_t align_bytes = bytes;
    const int retry = bytes >= GPU_HUGE_PAGE_SIZE ? 1 : 0;
    size_t align = bytes >= GPU_HUGE_PAGE_SIZE ? GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE;

    for (int i = 0; i <= retry; i++) {
      auto free_fragment = free_list_.lower_bound(align_bytes);
      if (free_fragment == free_list_.end()) break;

      uintptr_t addr = free_fragment->second;
      size = free_fragment->first;

      assert(size >= bytes && "SimpleHeap: map lower_bound failure.");

      // Find the containing block and fragment
      auto it = block_list_.upper_bound(addr);
      --it;
      auto& frag_map = it->second;
      auto fragment = frag_map.find(addr);

      assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap.");
      assert(size == fragment->second.size && "Inconsistency in SimpleHeap.");

      // Offset of the fragment start past the previous alignment boundary.
      size_t delta = addr & (align - 1);
      if (!delta) {
        // Fragment already starts at an aligned address.
        base = addr;
        free_list_.erase(free_fragment);
        // Sub-allocate from fragment.
        fragment->second.size = bytes;
        setUsed(fragment->second);
        // Record remaining free space.
        if (size > bytes) {
          free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
          frag_map[base + bytes] = makeFragment(free_fragment, size - bytes);
        }
      } else {
        // On the first pass, if the fragment cannot hold the request once its start is rounded
        // up to the alignment, search again for a fragment that is one alignment unit larger.
        // Requests below GPU_HUGE_PAGE_SIZE have no second pass, so for them the search ends
        // here and a whole block is used instead.
        if (i == 0 && size < bytes + align - delta) {
          align_bytes += align;
          continue;
        }

        uintptr_t aligned_base = addr + align - delta;
        base = aligned_base;

        // Erase the old free list entry; the fragment is split in up to three parts below.
        free_list_.erase(free_fragment);

        // fragment 1 - free (leading padding up to the aligned address)
        free_fragment = free_list_.insert(std::make_pair(aligned_base - addr, addr));
        frag_map[addr] = makeFragment(free_fragment, aligned_base - addr);

        // fragment 2 - used
        frag_map[base] = makeFragment(bytes);

        // fragment 3 - free (remaining tail, if any)
        if (size > aligned_base - addr + bytes) {
          free_fragment = free_list_.insert(std::make_pair(size - (aligned_base - addr) - bytes, aligned_base + bytes));
          frag_map[aligned_base + bytes] = makeFragment(free_fragment, size - (aligned_base - addr) - bytes);
        }
      }
      return reinterpret_cast<void*>(base);
    }

    // No usable fragment, check block cache
    if (bytes < default_block_size() && !block_cache_.empty()) {
      // Copy the fields out before pop_back() destroys the element.
      const auto& block = block_cache_.back();
      base = block.base_ptr_;
      size = block.length_;
      block_cache_.pop_back();
      cache_size_ -= size;
    } else {  // Alloc new block - new block may be larger than default.
      void* ptr = block_allocator_.alloc(bytes, size);
      base = reinterpret_cast<uintptr_t>(ptr);
      assert(ptr != nullptr && "Block allocation failed, Allocator is expected to throw.");
    }

    in_use_size_ += size;
    assert(size >= bytes && "Alloc exceeds block size.");
    // Sub alloc and insert free region.
    if (size > bytes) {
      auto free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes));
      block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes);
    }
    // Track used region
    block_list_[base][base] = makeFragment(bytes);

    // Disallow multiple suballocation from large blocks.
    // Prevents a small allocation from retaining a large block.
    if (bytes > default_block_size()) {
      const bool discarded = discardBlock(reinterpret_cast<void*>(base));
      assert(discarded && "Large block discard failed.");
      (void)discarded;  // Only read by the assert; avoids an unused warning when it is disabled.
    }

    return reinterpret_cast<void*>(base);
  }

  // Returns the fragment starting at ptr to the heap.
  //
  // Returns true on success or when ptr is null.  Returns false when ptr is not the start of a
  // fragment in use (unknown address or fragment already free).
  //
  // The fragment is merged with free neighbours.  When the merge leaves the block as a single
  // fragment the block is released: to the block allocator if it was discarded, otherwise to the
  // block cache.
  bool free(void* ptr) {
    if (ptr == nullptr) return true;

    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);

    // Find fragment and validate.
    auto frag_map_it = block_list_.upper_bound(base);
    if (frag_map_it == block_list_.begin()) return false;
    --frag_map_it;
    auto& frag_map = frag_map_it->second;
    auto fragment = frag_map.find(base);
    if (fragment == frag_map.end() || isFree(fragment->second)) return false;

    bool discard = fragment->second.discard;

    // Merge lower
    if (fragment != frag_map.begin()) {
      auto lower = fragment;
      --lower;
      if (isFree(lower->second)) {
        removeFreeListEntry(lower->second);
        lower->second.size += fragment->second.size;
        frag_map.erase(fragment);
        fragment = lower;
      }
    }

    // Merge upper
    {
      auto upper = fragment;
      ++upper;
      if ((upper != frag_map.end()) && isFree(upper->second)) {
        removeFreeListEntry(upper->second);
        fragment->second.size += upper->second.size;
        frag_map.erase(upper);
      }
    }

    // Release whole free blocks.
    if (frag_map.size() == 1) {
      Block block(fragment->first, fragment->second.size);
      block_list_.erase(frag_map_it);

      // Discard or add to the block cache.
      if (discard) {
        // in_use_size_ was already reduced when the block was discarded.
        block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
      } else {
        block_cache_.push_back(block);
        cache_size_ += block.length_;
        in_use_size_ -= block.length_;
      }

      balance();

      // Don't publish free space since block was moved to the cache.
      return true;
    }

    // Don't report free memory if discarding the fragment.  The fragment must still be marked
    // free (without a free list entry): otherwise later frees of its neighbours could not merge
    // with it, the block would never shrink to a single fragment and be released, and a repeated
    // free of the same pointer would not be rejected.
    if (discard) {
      setFree(fragment->second, free_list_.end());
      return true;
    }

    // Report free fragment
    const auto freeEntry =
        free_list_.insert(std::make_pair(size_t(fragment->second.size), fragment->first));
    setFree(fragment->second, freeEntry);

    return true;
  }

  // Releases cached blocks, oldest first, while more than one block is cached and the cache is
  // larger than twice the in-use size.
  void balance() {
    // Release old blocks when over cache limit.
    while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) {
      const auto& block = block_cache_.front();
      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
      cache_size_ -= block.length_;
      block_cache_.pop_front();
    }
  }

  // Releases every cached block to the block allocator.
  void trim() {
    for (const auto& block : block_cache_)
      block_allocator_.free(reinterpret_cast<void*>(block.base_ptr_), block.length_);
    block_cache_.clear();
    cache_size_ = 0;
  }

  // Total size in bytes of the blocks held in the block cache.
  size_t cache_size() const { return cache_size_; }

  // Default block size reported by the block allocator.
  size_t default_block_size() const { return block_allocator_.block_size(); }

  // Prevent reuse of the block containing ptr.  No further fragments will be allocated from the
  // block and the block will not be added to the block cache when it is free.
  //
  // Returns true on success, when ptr is null, or when the block is already discarded.  Returns
  // false when ptr does not lie inside any tracked block.
  bool discardBlock(void* ptr) {
    if (ptr == nullptr) return true;

    uintptr_t base = reinterpret_cast<uintptr_t>(ptr);

    // Find block validate.
    auto frag_map_it = block_list_.upper_bound(base);
    if (frag_map_it == block_list_.begin()) return false;
    --frag_map_it;
    auto& frag_map = frag_map_it->second;
    // ptr must lie between the first fragment's start and the last fragment's end.
    if ((base < frag_map.begin()->first) ||
        (frag_map.rbegin()->first + frag_map.rbegin()->second.size <= base))
      return false;

    // Is block already discarded?
    if (frag_map.begin()->second.discard) return true;

    // Mark all fragments for discard and compute block size.  Removes freelist records for all
    // fragments in the block.
    size_t size = 0;
    for (auto& frag : frag_map) {
      discard(frag.second);
      size += frag.second.size;
    }

    // Remove discarded block from in-use tracking and rebalance the block cache.
    in_use_size_ -= size;
    balance();

    return true;
  }
};

} // namespace wsl

#endif  // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_