rocm-systems/projects/clr/rocclr/platform/memory.cpp

/* Copyright (c) 2010 - 2022 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#include "amdocl/cl_common.hpp"

#include "os/alloc.hpp"
#include "platform/context.hpp"
#include "platform/object.hpp"
#include "platform/memory.hpp"
#include "device/device.hpp"

#include <atomic>

// Stores the no. of memory allocations
std::atomic<uint32_t> numAllocs = {0};

namespace amd {

bool BufferRect::create(const size_t* bufferOrigin, const size_t* region, size_t bufferRowPitch,
                        size_t bufferSlicePitch) {
  bool valid = false;
  // Find the buffer's row pitch
  rowPitch_ = (bufferRowPitch != 0) ? bufferRowPitch : region[0];
  // Find the buffer's slice pitch
  slicePitch_ = (bufferSlicePitch != 0) ? bufferSlicePitch : rowPitch_ * region[1];
  // Find the region start offset
  start_ = bufferOrigin[2] * slicePitch_ + bufferOrigin[1] * rowPitch_ + bufferOrigin[0];
  // Find the region relative end offset
  end_ = (region[2] - 1) * slicePitch_ + (region[1] - 1) * rowPitch_ + region[0];
  // Make sure we have a valid region
  if ((rowPitch_ >= region[0]) && (slicePitch_ >= (region[1] * rowPitch_)) &&
      ((slicePitch_ % rowPitch_) == 0)) {
    valid = true;
  }
  return valid;
}

bool HostMemoryReference::allocateMemory(size_t size, const Context& context) {
  assert(!alloced_ && "Runtime should not reallocate system memory!");
  size_t memoryAlignment = (CPU_MEMORY_ALIGNMENT_SIZE <= 0) ? 256 : CPU_MEMORY_ALIGNMENT_SIZE;
  size_ = amd::alignUp(size, memoryAlignment);
  //! \note memory size must be aligned for CAL pinning
  hostMem_ = CPU_MEMORY_GUARD_PAGES
      ? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN, CPU_MEMORY_GUARD_PAGE_SIZE * Ki)
      : context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN);
  alloced_ = (hostMem_ != NULL);
  return alloced_;
}

// Frees system memory if it was allocated
void HostMemoryReference::deallocateMemory(const Context& context) {
  if (alloced_) {
    if (CPU_MEMORY_GUARD_PAGES)
      GuardedMemory::deallocate(hostMem_);
    else
      context.hostFree(hostMem_);
    size_ = 0;
    alloced_ = false;
    hostMem_ = NULL;
  }
}

Memory::Memory(Context& context, Type type, Flags flags, size_t size, void* svmPtr)
    : numDevices_(0),
      deviceMemories_(NULL),
      destructorCallbacks_(NULL),
      context_(context),
      parent_(NULL),
      type_(type),
      hostMemRef_(NULL),
      origin_(0),
      size_(size),
      flags_(flags),
      version_(0),
      lastWriter_(NULL),
      interopObj_(NULL),
      vDev_(NULL),
      mapCount_(0),
      svmHostAddress_(svmPtr),
      flagsEx_(0),
      lockMemoryOps_("Memory Ops Lock", true) {
  svmPtrCommited_ = (flags & CL_MEM_SVM_FINE_GRAIN_BUFFER) ? true : false;
  canBeCached_ = true;
}

Memory::Memory(Memory& parent, Flags flags, size_t origin, size_t size, Type type)
    : numDevices_(0),
      deviceMemories_(NULL),
      destructorCallbacks_(NULL),
      context_(parent.getContext()),
      parent_(&parent),
      type_((type == 0) ? parent.type_ : type),
      hostMemRef_(NULL),
      origin_(origin),
      size_(size),
      flags_(flags),
      version_(parent.getVersion()),
      lastWriter_(parent.getLastWriter()),
      interopObj_(parent.getInteropObj()),
      vDev_(NULL),
      mapCount_(0),
      svmHostAddress_(parent.getSvmPtr()),
      flagsEx_(0),
      lockMemoryOps_("Memory Ops Lock", true) {
  svmPtrCommited_ = parent.isSvmPtrCommited();
  canBeCached_ = true;
  parent_->retain();
  parent_->isParent_ = true;

  if (parent.getHostMem() != nullptr) {
    setHostMem(reinterpret_cast<address>(parent.getHostMem()) + origin);
  }

  if (parent.getSvmPtr() != nullptr) {
    setSvmPtr(reinterpret_cast<address>(parent.getSvmPtr()) + origin);
  }

  // Inherit memory flags from the parent
  if ((flags_ & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY)) == 0) {
    flags_ |= parent_->getMemFlags() & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY);
  }

  flags_ |=
      parent_->getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR);

  if ((flags_ & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) {
    flags_ |= parent_->getMemFlags() &
        (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS);
  }
}

uint32_t Memory::NumDevicesWithP2P() {
  uint32_t devices = context_().devices().size();
  if (devices == 1) {
    // Add p2p devices for allocation
    devices = context_().devices().size() + context_().devices()[0]->P2PAccessDevices().size();
    if (devices > 1) {
      p2pAccess_ = true;
    }
  }
  return devices;
}

void Memory::initDeviceMemory() {
  deviceMemories_ = reinterpret_cast<DeviceMemory*>(reinterpret_cast<char*>(this) + sizeof(Memory));
  memset(deviceMemories_, 0, NumDevicesWithP2P() * sizeof(DeviceMemory));
}

// ================================================================================================
void Memory::resetAllocationState() {

  // Reset device memory allocation state
  for (size_t i = 0; i < context_().devices().size(); i++) {
    deviceAlloced_[context_().devices()[i]].store(AllocInit, std::memory_order_relaxed);
  }
}

// ================================================================================================
void* Memory::operator new(size_t size, const Context& context) {
  uint32_t devices = context.devices().size();
  if (devices == 1) {
    // Add p2p devices for allocation
    devices = context.devices().size() + context.devices()[0]->P2PAccessDevices().size();
  }

  return RuntimeObject::operator new(size + devices * sizeof(DeviceMemory));
}

void Memory::operator delete(void* p) { RuntimeObject::operator delete(p); }

void Memory::operator delete(void* p, const Context& context) { Memory::operator delete(p); }


void Memory::addSubBuffer(Memory* view) {
  amd::ScopedLock lock(lockMemoryOps());
  subBuffers_.push_back(view);
}

void Memory::removeSubBuffer(Memory* view) {
  amd::ScopedLock lock(lockMemoryOps());
  subBuffers_.remove(view);
}

bool Memory::allocHostMemory(void* initFrom, bool allocHostMem, bool forceCopy) {
  // Sanity checks (the parameters should have been prevalidated by the API)
  assert(!(flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR) && (initFrom == NULL) &&
           !allocHostMem && !isSvmPtrCommited()));
  assert(
      !((initFrom != NULL) && !forceCopy &&
        !(flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD))));
  assert(!(flags_ & CL_MEM_COPY_HOST_PTR && flags_ & CL_MEM_USE_HOST_PTR));

  const std::vector<Device*>& devices = context_().devices();

  // This allocation is necessary to use coherency mechanism
  // for the initialization
  if (getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
    allocHostMem = true;
  }

  // Did application request to use host memory?
  if (getMemFlags() & CL_MEM_USE_HOST_PTR) {
    setHostMem(initFrom);

    // Recalculate image size according to pitch
    Image* image = asImage();
    if (image != NULL) {
      if (image->getDims() < 3) {
        size_ = image->getRowPitch() * image->getHeight();
      } else {
        size_ = image->getSlicePitch() * image->getDepth();
      }
    }
  }
  // Allocate host memory buffer if needed.
  // @note: SVM host memory allocation should be done in the device backend
  else if (allocHostMem && !isInterop() && !(getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) {
    if (!hostMemRef_.allocateMemory(size_, context_())) {
      DevLogError("Cannot allocate Host Memory Buffer \n");
      return false;
    }

    // Copy data to the backing store if the app has requested
    if (((flags_ & CL_MEM_COPY_HOST_PTR) || forceCopy) && (initFrom != NULL)) {
      copyToBackingStore(initFrom);
    }
  }

  if (allocHostMem && type_ == CL_MEM_OBJECT_PIPE) {
    // Initialize the pipe for a CPU device
    clk_pipe_t* pipe = reinterpret_cast<clk_pipe_t*>(getHostMem());
    pipe->read_idx = 0;
    pipe->write_idx = 0;
    pipe->end_idx = asPipe()->getMaxNumPackets();
  }

  if ((flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) && (NULL == lastWriter_)) {
    // Signal write, so coherency mechanism will initialize
    // memory on all devices
    signalWrite(NULL);
  }

  return true;
}

// ================================================================================================
bool Memory::create(void* initFrom, bool sysMemAlloc, bool skipAlloc, bool forceAlloc) {
  static const bool forceAllocHostMem = false;

  // Sanity checks (can't defer and force allocations at the same time)
  assert(!(skipAlloc && forceAlloc));

  initDeviceMemory();

  // Check if it's a subbuffer allocation
  if (parent_ != NULL) {
    // Find host memory pointer for subbuffer
    if (parent_->getHostMem() != NULL) {
      setHostMem((address)parent_->getHostMem() + origin_);
    }

    // Add a new subbuffer to the list
    parent_->addSubBuffer(this);
  }
  // Allocate host memory if requested
  else if (!allocHostMemory(initFrom, forceAllocHostMem)) {
    DevLogError("Cannot allocate Host Memory \n");
    return false;
  }

  const std::vector<Device*>& devices = context_().devices();
  if (devices.size() == 1 && devices[0]->info().largeBar_) {
      largeBarSystem_ = 1;
  }

  // Forces system memory allocation on the device,
  // instead of device memory
  forceSysMemAlloc_ = sysMemAlloc;

  // Create memory on all available devices
  for (size_t i = 0; i < devices.size(); i++) {
    deviceAlloced_[devices[i]].store(AllocInit, std::memory_order_relaxed);

    deviceMemories_[i].ref_ = devices[i];
    deviceMemories_[i].value_ = NULL;

    if (forceAlloc ||
        (!skipAlloc &&
         ((devices.size() == 1) || DISABLE_DEFERRED_ALLOC))) {
      device::Memory* mem = getDeviceMemory(*devices[i]);
      if (NULL == mem) {
        LogPrintfError("Can't allocate memory size - 0x%08X bytes!", getSize());
        return false;
      }
    }
  }
  // Add a VA range into VA range map
  if (getMemFlags() & CL_MEM_VA_RANGE_AMD) {
    amd::MemObjMap::AddVirtualMemObj(getSvmPtr(), this);
  }
  // Store the unique id for each memory allocation
  uniqueId_ = ++numAllocs;
  return true;
}

// ================================================================================================
bool Memory::addDeviceMemory(const Device* dev) {
  bool result = false;
  AllocState create = AllocCreate;
  AllocState init = AllocInit;

  amd::ScopedLock lock(lockMemoryOps());
  if (deviceAlloced_[dev].compare_exchange_strong(init, create, std::memory_order_acq_rel)) {
    // Check if runtime already allocated all available slots for device memory
    if (numDevices() == NumDevicesWithP2P()) {
      // Mark the allocation as an empty
      deviceAlloced_[dev].store(AllocInit, std::memory_order_release);
      return result;
    }
    device::Memory* dm = dev->createMemory(*this);

    // Add the new memory allocation to the device map
    if (NULL != dm) {
      deviceMemories_[numDevices_].ref_ = dev;
      deviceMemories_[numDevices_].value_ = dm;
      numDevices_++;
      assert((numDevices() <= NumDevicesWithP2P()) && "Too many device objects");

      // Mark the allocation with the complete flag
      deviceAlloced_[dev] = AllocComplete;
      if (getSvmPtr() != nullptr) {
        svmBase_ = dm;
      }
    } else {
      LogError("Video memory allocation failed!");
      // Mark the allocation as an empty
      deviceAlloced_[dev].store(AllocInit, std::memory_order_release);
      return result;
    }
  }

  // Make sure runtime finished memory allocation.
  // Loop if in the create state
  while (deviceAlloced_[dev].load(std::memory_order_acquire) == AllocCreate) {
    Os::yield();
  }

  if (deviceAlloced_[dev].load(std::memory_order_acquire) == AllocComplete) {
    result = true;
  }

  return result;
}

void Memory::replaceDeviceMemory(const Device* dev, device::Memory* dm) {
  uint i;
  for (i = 0; i < numDevices_; ++i) {
    if (deviceMemories_[i].ref_ == dev) {
      delete deviceMemories_[i].value_;
      break;
    }
  }

  if (numDevices_ == 0) {
    ++numDevices_;
    deviceMemories_[0].ref_ = dev;
  }

  deviceMemories_[i].value_ = dm;
  deviceAlloced_[dev].store(AllocRealloced, std::memory_order_release);
}

device::Memory* Memory::getDeviceMemory(const Device& dev, bool alloc) {
  device::Memory* dm = NULL;
  for (uint i = 0; i < numDevices_; ++i) {
    if (deviceMemories_[i].ref_ == &dev) {
      dm = deviceMemories_[i].value_;
      break;
    }
  }

  if ((NULL == dm) && alloc) {
    if (!addDeviceMemory(&dev)) {
      return NULL;
    }
    dm = deviceMemories_[numDevices() - 1].value_;
  }

  return dm;
}

// ================================================================================================
Memory::~Memory() {
  // For_each destructor callback:
  DestructorCallBackEntry* entry;
  for (entry = destructorCallbacks_; entry != NULL; entry = entry->next_) {
    // invoke the callback function.
    entry->callback_(const_cast<cl_mem>(as_cl(this)), entry->data_);
  }

  // Release the parent.
  if (NULL != parent_) {
    // Update cache if runtime destroys a subbuffer
    if (NULL != parent_->getHostMem() && (vDev_ == NULL)) {
      cacheWriteBack(nullptr);
    }
    parent_->removeSubBuffer(this);
  }

  if (NULL != deviceMemories_) {
    // Destroy all device memory objects
    for (uint i = 0; i < numDevices_; ++i) {
      delete deviceMemories_[i].value_;
    }
  }

  // Sanity check
  if (subBuffers_.size() != 0) {
    LogError("Can't have views if parent is destroyed!");
  }

  // Destroy the destructor callback entries
  DestructorCallBackEntry* callback = destructorCallbacks_;
  while (callback != NULL) {
    DestructorCallBackEntry* next = callback->next_;
    delete callback;
    callback = next;
  }

  // Make sure runtime destroys the parent only after subbuffer destruction
  if (NULL != parent_) {
    parent_->release();
  }
  hostMemRef_.deallocateMemory(context_());
  if (getMemFlags() & CL_MEM_VA_RANGE_AMD) {
    amd::MemObjMap::RemoveVirtualMemObj(getSvmPtr());
    // If runtime executes graph mempool with VM, then VA can be mapped in space
    // for graph validation logic during execution. And the reason it's not unmaped
    // in graph itself because the app can have a graph without a free node
    if (amd::MemObjMap::FindMemObj(getSvmPtr())) {
      amd::MemObjMap::RemoveMemObj(getSvmPtr());
    }
  }
}

// ================================================================================================
bool Memory::setDestructorCallback(DestructorCallBackFunction callback, void* data) {
  DestructorCallBackEntry* entry = new DestructorCallBackEntry(callback, data);
  if (entry == NULL) {
    return false;
  }

  entry->next_ = destructorCallbacks_;
  while (!destructorCallbacks_.compare_exchange_weak(entry->next_, entry))
    ;  // Someone else is also updating the head of the linked list! reload.

  return true;
}

void Memory::signalWrite(const Device* writer) {
  // Disable cache coherency layer for HIP
  if (!amd::IS_HIP) {
    // (The potential race condition below doesn't matter, no critical
    // section needed)
    ++version_;
    lastWriter_ = writer;
    // Update all subbuffers for this object
    for (auto buf : subBuffers_) {
      buf->signalWrite(writer);
    }
  }
}

void Memory::cacheWriteBack(device::VirtualDevice* vDev) {
  if (NULL != lastWriter_) {
    device::Memory* dmem = getDeviceMemory(*lastWriter_);
    //! @note It's a special condition, when a subbuffer was created,
    //! but never used. Thus dev memory is still NULL and lastWriter_
    //! was passed from the parent.
    if (NULL != dmem) {
      dmem->syncHostFromCache(vDev);
    }
  } else if (isParent()) {
    // On CPU parent can't be synchronized, because lastWriter_ could be NULL
    // and syncHostFromCache() won't be called.
    for (uint i = 0; i < numDevices_; ++i) {
      deviceMemories_[i].value_->syncHostFromCache(vDev);
    }
  }
}

void Memory::copyToBackingStore(void* initFrom) { memcpy(getHostMem(), initFrom, size_); }

bool Memory::usesSvmPointer() const {
  if (!(flags_ & CL_MEM_USE_HOST_PTR)) {
    return false;
  }
  // If the application host pointer lies within a SVM region, so does the
  // sub-buffer host pointer - so the following check works in both cases
  return (SvmBuffer::malloced(getHostMem()) || NULL != svmHostAddress_);
}

void Memory::commitSvmMemory() {
  ScopedLock lock(lockMemoryOps_);
  // if VRAM is visible for host, it is not necessary to mmap again
  if (!svmPtrCommited_ && !largeBarSystem_) {
    amd::Os::commitMemory(svmHostAddress_, size_, amd::Os::MEM_PROT_RW);
    svmPtrCommited_ = true;
  }
}

void Memory::uncommitSvmMemory() {
  ScopedLock lock(lockMemoryOps_);
  if (svmPtrCommited_ && !(flags_ & CL_MEM_SVM_FINE_GRAIN_BUFFER)) {
    amd::Os::uncommitMemory(svmHostAddress_, size_);
    svmPtrCommited_ = false;
  }
}

void Buffer::initDeviceMemory() {
  deviceMemories_ = reinterpret_cast<DeviceMemory*>(reinterpret_cast<char*>(this) + sizeof(Buffer));
  memset(deviceMemories_, 0, NumDevicesWithP2P() * sizeof(DeviceMemory));
}

bool Buffer::create(void* initFrom, bool sysMemAlloc, bool skipAlloc, bool forceAlloc) {
  if ((getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) && (initFrom != NULL)) {
    busAddress_ = *(reinterpret_cast<cl_bus_address_amd*>(initFrom));
    initFrom = NULL;
  } else {
    busAddress_.surface_bus_address = 0;
    busAddress_.marker_bus_address = 0;
  }
  return Memory::create(initFrom, sysMemAlloc, skipAlloc, forceAlloc);
}

bool Buffer::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const {
  return ((origin[0] == 0) && (region[0] == getSize())) ? true : false;
}

bool Buffer::validateRegion(const Coord3D& origin, const Coord3D& region) const {
  return ((region[0] > 0) && (origin[0] < getSize()) && ((origin[0] + region[0]) <= getSize()))
      ? true
      : false;
}

void Pipe::initDeviceMemory() {
  deviceMemories_ = reinterpret_cast<DeviceMemory*>(reinterpret_cast<char*>(this) + sizeof(Pipe));
  memset(deviceMemories_, 0, NumDevicesWithP2P() * sizeof(DeviceMemory));
}

#define GETMIPDIM(dim, mip) (((dim >> mip) > 0) ? (dim >> mip) : 1)

Image::Image(const Format& format, Image& parent, uint baseMipLevel, cl_mem_flags flags)
    : Memory(parent, flags, 0,
             parent.getWidth() * parent.getHeight() * parent.getDepth() * format.getElementSize()),
      impl_(format, Coord3D(parent.getWidth() * parent.getImageFormat().getElementSize() /
                                format.getElementSize(),
                            parent.getHeight(), parent.getDepth()),
            parent.getRowPitch(), parent.getSlicePitch(), parent.getBytePitch()),
      mipLevels_(1),
      baseMipLevel_(baseMipLevel) {
  if (baseMipLevel > 0) {
    impl_.region_.c[0] = GETMIPDIM(parent.getWidth(), baseMipLevel) *
        parent.getImageFormat().getElementSize() / format.getElementSize();
    impl_.region_.c[1] = GETMIPDIM(parent.getHeight(), baseMipLevel);
    impl_.region_.c[2] = GETMIPDIM(parent.getDepth(), baseMipLevel);

    if (parent.getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
      impl_.region_.c[1] = parent.getHeight();
    } else if (parent.getType() == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
      impl_.region_.c[2] = parent.getDepth();
    }
    size_ = getWidth() * getHeight() * parent.getDepth() * format.getElementSize();
  }
  initDimension();
}

Image::Image(Context& context, Type type, Flags flags, const Format& format, size_t width,
             size_t height, size_t depth, size_t rowPitch, size_t slicePitch, uint mipLevels)
    : Memory(context, type, flags, width * height * depth * format.getElementSize()),
      impl_(format, Coord3D(width, height, depth), rowPitch, slicePitch),
      mipLevels_(mipLevels),
      baseMipLevel_(0) {
  initDimension();
}

Image::Image(Buffer& buffer, Type type, Flags flags, const Format& format, size_t width,
             size_t height, size_t depth, size_t rowPitch, size_t slicePitch)
    : Memory(buffer, flags, 0, buffer.getSize(), type),
      impl_(format, Coord3D(width, height, depth), rowPitch, slicePitch),
      mipLevels_(1),
      baseMipLevel_(0) {
  initDimension();
}

bool Image::validateDimensions(const std::vector<amd::Device*>& devices, cl_mem_object_type type,
                               size_t width, size_t height, size_t depth, size_t arraySize) {
  bool sizePass = false;
  switch (type) {
    case CL_MEM_OBJECT_IMAGE3D:
      if ((width == 0) || (height == 0) || (depth < 1)) {
        DevLogPrintfError("Invalid Dimenstions, width: %u height: %u depth: %u \n",
                          width, height, depth);
        return false;
      }
      for (const auto& dev : devices) {
        if ((dev->info().image3DMaxWidth_ >= width) && (dev->info().image3DMaxHeight_ >= height) &&
            (dev->info().image3DMaxDepth_ >= depth)) {
          return true;
        }
      }
      break;
    case CL_MEM_OBJECT_IMAGE2D_ARRAY:
      if (arraySize == 0) {
        DevLogError("Array is empty \n");
        return false;
      }
      for (const auto& dev : devices) {
        if (dev->info().imageMaxArraySize_ >= arraySize) {
          sizePass = true;
          break;
        }
      }
      if (!sizePass) {
        DevLogPrintfError("Cannot allocate image of size: %u \n", arraySize);
        return false;
      }
    // Fall through...
    case CL_MEM_OBJECT_IMAGE2D:
      if ((width == 0) || (height == 0)) {
        DevLogPrintfError("Invalid dimensions width: %u height: %u \n", width, height);
        return false;
      }
      for (const auto dev : devices) {
        if ((dev->info().image2DMaxHeight_ >= height) && (dev->info().image2DMaxWidth_ >= width)) {
          return true;
        }
      }
      break;
    case CL_MEM_OBJECT_IMAGE1D_ARRAY:
      if (arraySize == 0) {
        DevLogError("Array size cannot be empty \n");
        return false;
      }

      for (const auto& dev : devices) {
        if (dev->info().imageMaxArraySize_ >= arraySize) {
          sizePass = true;
          break;
        }
      }
      if (!sizePass) {
        DevLogPrintfError("Cannot allocate image of size: %u \n", arraySize);
        return false;
      }
    // Fall through...
    case CL_MEM_OBJECT_IMAGE1D:
      if (width == 0) {
        DevLogError("Invalid dimension \n");
        return false;
      }
      for (const auto& dev : devices) {
        if (dev->info().image2DMaxWidth_ >= width) {
          return true;
        }
      }
      break;
    case CL_MEM_OBJECT_IMAGE1D_BUFFER:
      if (width == 0) {
        DevLogError("Invalid dimension \n");
        return false;
      }
      for (const auto& dev : devices) {
        if (dev->info().imageMaxBufferSize_ >= width) {
          return true;
        }
      }
      break;
    default:
      break;
  }

  DevLogError("Dimension Validation failed \n");
  return false;
}

void Image::initDimension() {
  const size_t elemSize = impl_.format_.getElementSize();
  if (impl_.rp_ == 0) {
    impl_.rp_ = impl_.region_[0] * elemSize;
  }
  switch (type_) {
    case CL_MEM_OBJECT_IMAGE3D:
    case CL_MEM_OBJECT_IMAGE2D_ARRAY:
      dim_ = 3;
      if (impl_.sp_ == 0) {
        impl_.sp_ = impl_.region_[0] * impl_.region_[1] * elemSize;
      }
      break;
    case CL_MEM_OBJECT_IMAGE2D:
    case CL_MEM_OBJECT_IMAGE1D_ARRAY:
      dim_ = 2;
      if ((impl_.sp_ == 0) && (type_ == CL_MEM_OBJECT_IMAGE1D_ARRAY)) {
        impl_.sp_ = impl_.rp_;
      }
      break;
    case CL_MEM_OBJECT_IMAGE1D:
    case CL_MEM_OBJECT_IMAGE1D_BUFFER:
    default:
      dim_ = 1;
      break;
  }
}

void Image::initDeviceMemory() {
  deviceMemories_ = reinterpret_cast<DeviceMemory*>(reinterpret_cast<char*>(this) + sizeof(Image));
  memset(deviceMemories_, 0, NumDevicesWithP2P() * sizeof(DeviceMemory));
}

size_t Image::Format::getNumChannels() const {
  switch (image_channel_order) {
    case CL_RG:
    case CL_RA:
      return 2;

    case CL_RGB:
    case CL_sRGB:
    case CL_sRGBx:
      return 3;

    case CL_RGBA:
    case CL_BGRA:
    case CL_ARGB:
    case CL_sRGBA:
    case CL_sBGRA:
      return 4;
  }
  return 1;
}

size_t Image::Format::getElementSize() const {
  size_t bytesPerPixel = getNumChannels();
  switch (image_channel_data_type) {
    case CL_SNORM_INT8:
    case CL_UNORM_INT8:
    case CL_SIGNED_INT8:
    case CL_UNSIGNED_INT8:
      break;

    case CL_UNORM_INT_101010:
      bytesPerPixel = 4;
      break;
    case CL_SIGNED_INT32:
    case CL_UNSIGNED_INT32:
    case CL_FLOAT:
      bytesPerPixel *= 4;
      break;

    default:
      bytesPerPixel *= 2;
      break;
  }
  return bytesPerPixel;
}

bool Image::Format::isValid() const {
  switch (image_channel_data_type) {
    case CL_SNORM_INT8:
    case CL_SNORM_INT16:
    case CL_UNORM_INT8:
    case CL_UNORM_INT16:
    case CL_UNORM_SHORT_565:
    case CL_UNORM_SHORT_555:
    case CL_UNORM_INT_101010:
    case CL_SIGNED_INT8:
    case CL_SIGNED_INT16:
    case CL_SIGNED_INT32:
    case CL_UNSIGNED_INT8:
    case CL_UNSIGNED_INT16:
    case CL_UNSIGNED_INT32:
    case CL_HALF_FLOAT:
    case CL_FLOAT:
      break;

    default: {
      DevLogPrintfError("Invalid Image format: %u \n",
                        image_channel_data_type);
      return false;
    }
  }

  switch (image_channel_order) {
    case CL_R:
    case CL_A:
    case CL_RG:
    case CL_RA:
    case CL_RGBA:
      break;

    case CL_INTENSITY:
    case CL_LUMINANCE:
      switch (image_channel_data_type) {
        case CL_SNORM_INT8:
        case CL_SNORM_INT16:
        case CL_UNORM_INT8:
        case CL_UNORM_INT16:
        case CL_HALF_FLOAT:
        case CL_FLOAT:
          break;

        default: {
          DevLogPrintfError("Invalid Luminance: %u \n",
                            image_channel_data_type);
          return false;
        }
      }
      break;

    case CL_RGB:
      switch (image_channel_data_type) {
        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
        case CL_UNORM_INT_101010:
          break;

        default: {
          DevLogPrintfError("Invalid RGB: %u \n",
                         image_channel_data_type);
          return false;
        }
      }
      break;

    case CL_BGRA:
    case CL_ARGB:
      switch (image_channel_data_type) {
        case CL_SNORM_INT8:
        case CL_UNORM_INT8:
        case CL_SIGNED_INT8:
        case CL_UNSIGNED_INT8:
          break;

        default: {
          DevLogPrintfError("Invalid BGRA/ARGB: %u \n",
                         image_channel_data_type);
          return false;
        }
      }
      break;

    case CL_sRGB:
    case CL_sRGBx:
    case CL_sRGBA:
    case CL_sBGRA:
      switch (image_channel_data_type) {
        case CL_UNORM_INT8:
          break;
        default: {
          DevLogPrintfError("Invalid sBGRA: %u \n",
                         image_channel_data_type);
          return false;
        }
      }
      break;

    case CL_DEPTH:
      switch (image_channel_data_type) {
        case CL_UNORM_INT16:
        case CL_FLOAT:
          break;
        default: {
          DevLogPrintfError("Invalid CL Depth: %u \n",
                         image_channel_data_type);
          return false;
        }
      }
      break;

    default: {
      DevLogPrintfError("Invalid image_channel_order: %u \n",
                     image_channel_order);
      return false;
    }
  }
  return true;
}

// definition of list of supported formats
const cl_image_format Image::supportedFormats[]= {
    // R
    {CL_R, CL_SNORM_INT8},
    {CL_R, CL_SNORM_INT16},
    {CL_R, CL_UNORM_INT8},
    {CL_R, CL_UNORM_INT16},

    {CL_R, CL_SIGNED_INT8},
    {CL_R, CL_SIGNED_INT16},
    {CL_R, CL_SIGNED_INT32},
    {CL_R, CL_UNSIGNED_INT8},
    {CL_R, CL_UNSIGNED_INT16},
    {CL_R, CL_UNSIGNED_INT32},

    {CL_R, CL_HALF_FLOAT},
    {CL_R, CL_FLOAT},

    // A
    {CL_A, CL_SNORM_INT8},
    {CL_A, CL_SNORM_INT16},
    {CL_A, CL_UNORM_INT8},
    {CL_A, CL_UNORM_INT16},

    {CL_A, CL_SIGNED_INT8},
    {CL_A, CL_SIGNED_INT16},
    {CL_A, CL_SIGNED_INT32},
    {CL_A, CL_UNSIGNED_INT8},
    {CL_A, CL_UNSIGNED_INT16},
    {CL_A, CL_UNSIGNED_INT32},

    {CL_A, CL_HALF_FLOAT},
    {CL_A, CL_FLOAT},

    // RG
    {CL_RG, CL_SNORM_INT8},
    {CL_RG, CL_SNORM_INT16},
    {CL_RG, CL_UNORM_INT8},
    {CL_RG, CL_UNORM_INT16},

    {CL_RG, CL_SIGNED_INT8},
    {CL_RG, CL_SIGNED_INT16},
    {CL_RG, CL_SIGNED_INT32},
    {CL_RG, CL_UNSIGNED_INT8},
    {CL_RG, CL_UNSIGNED_INT16},
    {CL_RG, CL_UNSIGNED_INT32},

    {CL_RG, CL_HALF_FLOAT},
    {CL_RG, CL_FLOAT},

    // RGBA
    {CL_RGBA, CL_SNORM_INT8},
    {CL_RGBA, CL_SNORM_INT16},
    {CL_RGBA, CL_UNORM_INT8},
    {CL_RGBA, CL_UNORM_INT16},

    {CL_RGBA, CL_SIGNED_INT8},
    {CL_RGBA, CL_SIGNED_INT16},
    {CL_RGBA, CL_SIGNED_INT32},
    {CL_RGBA, CL_UNSIGNED_INT8},
    {CL_RGBA, CL_UNSIGNED_INT16},
    {CL_RGBA, CL_UNSIGNED_INT32},

    {CL_RGBA, CL_HALF_FLOAT},
    {CL_RGBA, CL_FLOAT},

    // ARGB
    {CL_ARGB, CL_SNORM_INT8},
    {CL_ARGB, CL_UNORM_INT8},
    {CL_ARGB, CL_SIGNED_INT8},
    {CL_ARGB, CL_UNSIGNED_INT8},

    // BGRA
    {CL_BGRA, CL_SNORM_INT8},
    {CL_BGRA, CL_UNORM_INT8},
    {CL_BGRA, CL_SIGNED_INT8},
    {CL_BGRA, CL_UNSIGNED_INT8},

    // LUMINANCE
    {CL_LUMINANCE, CL_SNORM_INT8},
    {CL_LUMINANCE, CL_SNORM_INT16},
    {CL_LUMINANCE, CL_UNORM_INT8},
    {CL_LUMINANCE, CL_UNORM_INT16},
    {CL_LUMINANCE, CL_HALF_FLOAT},
    {CL_LUMINANCE, CL_FLOAT},

    // INTENSITY
    {CL_INTENSITY, CL_SNORM_INT8},
    {CL_INTENSITY, CL_SNORM_INT16},
    {CL_INTENSITY, CL_UNORM_INT8},
    {CL_INTENSITY, CL_UNORM_INT16},
    {CL_INTENSITY, CL_HALF_FLOAT},
    {CL_INTENSITY, CL_FLOAT},

    // RGB
    {CL_RGB, CL_UNORM_INT_101010},

    // sRGB
    {CL_sRGBA, CL_UNORM_INT8},

    // DEPTH
    {CL_DEPTH, CL_UNORM_INT16},
    {CL_DEPTH, CL_FLOAT},
};

const uint32_t NUM_CHANNEL_ORDER_OF_RGB = 1;  // The number of channel orders of RGB at the end of
                                             // the table supportedFormats above and before sRGB and
                                             // depth.
const uint32_t NUM_CHANNEL_ORDER_OF_sRGB = 1;  // The number of channel orders of sRGB at the end of
                                              // the table supportedFormats above and before depth.
const uint32_t NUM_CHANNEL_ORDER_OF_DEPTH =
    2;  // The number of channel orders of DEPTH at the end of the table supportedFormats above.

// definition of list of supported RA formats
const cl_image_format Image::supportedFormatsRA[] = {
    {CL_RA, CL_SNORM_INT8},     {CL_RA, CL_SNORM_INT16},   {CL_RA, CL_UNORM_INT8},
    {CL_RA, CL_UNORM_INT16},    {CL_RA, CL_SIGNED_INT8},   {CL_RA, CL_SIGNED_INT16},
    {CL_RA, CL_SIGNED_INT32},   {CL_RA, CL_UNSIGNED_INT8}, {CL_RA, CL_UNSIGNED_INT16},
    {CL_RA, CL_UNSIGNED_INT32}, {CL_RA, CL_HALF_FLOAT},    {CL_RA, CL_FLOAT},
};

const cl_image_format Image::supportedDepthStencilFormats[] = {
    // DEPTH STENCIL
    {CL_DEPTH_STENCIL, CL_FLOAT},
    {CL_DEPTH_STENCIL, CL_UNORM_INT24}};

uint32_t Image::numSupportedFormats(const Context& context, cl_mem_object_type image_type,
                                   cl_mem_flags flags) {
  const std::vector<amd::Device*>& devices = context.devices();
  uint numFormats = sizeof(supportedFormats) / sizeof(cl_image_format);

  bool supportRA = false;
  bool supportDepthsRGB = false;
  bool supportDepthStencil = false;

  // Add RA if RA is supported.
  for (uint i = 0; i < devices.size(); i++) {
    if (devices[i]->settings().supportRA_) {
      supportRA = true;
    }
    if (devices[i]->settings().supportDepthsRGB_) {
      supportDepthsRGB = true;
    }
    if (devices[i]->settings().checkExtension(ClKhrGLDepthImages) &&
        (context.info().flags_ & Context::GLDeviceKhr)) {
      supportDepthStencil = true;
    }
  }

  if (supportDepthsRGB) {
    if ((image_type != CL_MEM_OBJECT_IMAGE2D) && (image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) &&
        (image_type != 0)) {
      numFormats -= NUM_CHANNEL_ORDER_OF_DEPTH;  // substract channel order of DEPTH type.
    }
    // Currently we are not supported sRGB for write_imagef (extension cl_khr_srgb_image_writes)
    if ((image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) ||
        ((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) != 0)) {
      numFormats -= NUM_CHANNEL_ORDER_OF_sRGB;
    }
  } else {
    numFormats -= NUM_CHANNEL_ORDER_OF_RGB;    // substract channel order of RGB type.
    numFormats -= NUM_CHANNEL_ORDER_OF_sRGB;   // substract channel order of sRGB type.
    numFormats -= NUM_CHANNEL_ORDER_OF_DEPTH;  // substract channel order of DEPTH type.
  }

  // Add RA if RA is supported. RA isn't supported on SI.
  if (supportRA) {
    numFormats +=
        sizeof(supportedFormatsRA) / sizeof(cl_image_format);  // Add channel order of RA type.
  }

  if (supportDepthStencil) {
    if (flags & CL_MEM_READ_ONLY) {
      numFormats += sizeof(supportedDepthStencilFormats) / sizeof(cl_image_format);
    }
  }

  return numFormats;
}

uint32_t Image::getSupportedFormats(const Context& context, cl_mem_object_type image_type,
                                   const uint32_t num_entries, cl_image_format* image_formats,
                                   cl_mem_flags flags) {
  const std::vector<amd::Device*>& devices = context.devices();
  uint numFormats = 0;

  bool supportRA = false;
  bool supportDepthsRGB = false;
  bool supportDepthStencil = false;

  // Add RA if RA is supported.
  for (uint i = 0; i < devices.size(); i++) {
    if (devices[i]->settings().supportRA_) {
      supportRA = true;
    }
    if (devices[i]->settings().supportDepthsRGB_) {
      supportDepthsRGB = true;
    }
    if (devices[i]->settings().checkExtension(ClKhrGLDepthImages) &&
        (context.info().flags_ & Context::GLDeviceKhr)) {
      supportDepthStencil = true;
    }
  }

  cl_image_format* format = image_formats;
  uint numSupportedFormats = sizeof(supportedFormats) / sizeof(cl_image_format);

  bool srgbWriteSupported = true;
  if (supportDepthsRGB) {
    if ((image_type != CL_MEM_OBJECT_IMAGE2D) && (image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) &&
        (image_type != 0)) {
      numSupportedFormats -= NUM_CHANNEL_ORDER_OF_DEPTH;
    }
    // Currently we are not supported sRGB for write_imagef (extension cl_khr_srgb_image_writes)
    if ((image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) ||
        ((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) != 0)) {
      srgbWriteSupported = false;
    }
  } else {
    numSupportedFormats -= NUM_CHANNEL_ORDER_OF_RGB;    // substract channel order of RGB type.
    numSupportedFormats -= NUM_CHANNEL_ORDER_OF_sRGB;   // substract channel order of sRGB type.
    numSupportedFormats -= NUM_CHANNEL_ORDER_OF_DEPTH;  // substract channel order of DEPTH type.
  }

  for (uint i = 0; i < numSupportedFormats; i++) {
    if (numFormats == num_entries) {
      break;
    }
    if (!srgbWriteSupported) {
      if ((amd::Image::supportedFormats[i].image_channel_order == CL_sRGBA) ||
          (amd::Image::supportedFormats[i].image_channel_order == CL_sRGB) ||
          (amd::Image::supportedFormats[i].image_channel_order == CL_sRGBx) ||
          (amd::Image::supportedFormats[i].image_channel_order == CL_sBGRA)) {
        continue;
      }
    }
    *format++ = amd::Image::supportedFormats[i];
    numFormats++;
  }

  // Add RA if RA is supported.
  if (supportRA) {
    for (uint i = 0; i < sizeof(supportedFormatsRA) / sizeof(cl_image_format); i++) {
      if (numFormats == num_entries) {
        break;
      }
      *format++ = amd::Image::supportedFormatsRA[i];
      numFormats++;
    }
  }

  if (supportDepthStencil) {
    if (flags & CL_MEM_READ_ONLY) {
      for (uint i = 0; i < sizeof(supportedDepthStencilFormats) / sizeof(cl_image_format); i++) {
        if (numFormats == num_entries) {
          break;
        }
        *format++ = amd::Image::supportedDepthStencilFormats[i];
        numFormats++;
      }
    }
  }
  return numFormats;
}

bool Image::Format::isSupported(const Context& context, cl_mem_object_type image_type,
                                cl_mem_flags flags) const {
  const cl_image_format RGBA10 = {CL_RGBA, CL_UNORM_INT_101010};

  uint numFormats = numSupportedFormats(context, image_type, flags);

  std::vector<cl_image_format> image_formats(numFormats);

  getSupportedFormats(context, image_type, numFormats, image_formats.data(), flags);

  for (uint i = 0; i < numFormats; i++) {
    if (*this == image_formats[i]) {
      return true;
    }
  }
  if (*this == RGBA10) {
      return true;
  }

  return false;
}

// ================================================================================================
Image* Image::createView(const Context& context, const Format& format, device::VirtualDevice* vDev,
                         uint baseMipLevel, cl_mem_flags flags) {

  // Find the image dimensions and create a corresponding object
  Image* view = new (context) Image(format, *this, baseMipLevel, flags);

  if (view != nullptr) {
    // Set GPU virtual device for this view
    view->setVirtualDevice(vDev);

    view->resetAllocationState();

    // Initialize array of the device memory pointers
    view->initDeviceMemory();

    // Check if runtime has to allocate memory
    if ((context.devices().size() == 1) || DISABLE_DEFERRED_ALLOC) {
      for (uint i = 0; i < numDevices_; ++i) {
        // Make sure the parent's device memory is avaialbe
        if ((deviceMemories_[i].ref_ != nullptr) &&
            (deviceMemories_[i].value_ != nullptr)) {
          device::Memory* mem = view->getDeviceMemory(*(deviceMemories_[i].ref_));
        }
      }
    }
  }

  return view;
}

// ================================================================================================
bool Image::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const {
  return (origin[0] == 0 && origin[1] == 0 && origin[2] == 0 && region[0] == getWidth() &&
          region[1] == getHeight() && region[2] == getDepth())
      ? true
      : false;
}

bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const {
  return ((region[0] > 0) && (region[1] > 0) && (region[2] > 0) && (origin[0] < getWidth()) &&
          (region[0] != 0) && (origin[1] < getHeight()) && (region[1] != 0) &&
          (origin[2] < getDepth()) && (region[2] != 0) && ((origin[0] + region[0]) <= getWidth()) &&
          ((origin[1] + region[1]) <= getHeight()) && ((origin[2] + region[2]) <= getDepth()))
      ? true
      : false;
}

bool Image::isRowSliceValid(size_t rowPitch, size_t slice, size_t width, size_t height) const {
  size_t tmpHeight = (getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 1 : height;

  bool valid = (rowPitch == 0) ||
      ((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize()));

  return ((slice == 0) || ((slice != 0) && (slice >= rowPitch * tmpHeight))) ? valid : false;
}

void Image::copyToBackingStore(void* initFrom) {
  char* dst = reinterpret_cast<char*>(getHostMem());
  size_t cpySize = getWidth() * getImageFormat().getElementSize();

  for (uint z = 0; z < getDepth(); ++z) {
    char* src = reinterpret_cast<char*>(initFrom) + z * getSlicePitch();
    for (uint y = 0; y < getHeight(); ++y) {
      memcpy(dst, src, cpySize);
      dst += cpySize;
      src += getRowPitch();
    }
  }

  impl_.rp_ = cpySize;
  if (impl_.sp_ != 0) {
    impl_.sp_ = impl_.rp_;
    if (getDims() == 3) {
      impl_.sp_ *= getHeight();
    }
  }
}

static int round_to_even(float v) {
  // clamp overflow
  if (v >= -(float)std::numeric_limits<int>::min()) {
    return std::numeric_limits<int>::max();
  }
  if (v <= (float)std::numeric_limits<int>::min()) {
    return std::numeric_limits<int>::min();
  }
  static const unsigned int magic[2] = {0x4b000000u, 0xcb000000u};

  // round fractional values to integer value
  if (fabsf(v) < *reinterpret_cast<const float*>(&magic[0])) {
    float magicVal = *reinterpret_cast<const float*>(&magic[v < 0.0f]);
    v += magicVal;
    v -= magicVal;
  }

  return static_cast<int>(v);
}

static uint16_t float2half_rtz(float f) {
  union {
    float f;
    uint32_t u;
  } u = {f};
  uint32_t sign = (u.u >> 16) & 0x8000;
  float x = fabsf(f);

  // Nan
  if (x != x) {
    u.u >>= (24 - 11);
    u.u &= 0x7fff;
    u.u |= 0x0200;  // silence the NaN
    return u.u | sign;
  }
  int values[5] = {0x47800000, 0x33800000, 0x38800000, 0x4b800000, 0x7f800000};
  // overflow
  if (x >= *reinterpret_cast<float*>(&values[0])) {
    if (x == *reinterpret_cast<float*>(&values[4])) {
      return 0x7c00 | sign;
    }
    return 0x7bff | sign;
  }

  // underflow
  if (x < *reinterpret_cast<float*>(&values[1])) {
    return sign;  // The halfway case can return 0x0001 or 0. 0 is even.
  }

  // half denormal
  if (x < *reinterpret_cast<float*>(&values[2])) {
    x *= *reinterpret_cast<float*>(&values[3]);
    return static_cast<uint16_t>((int)x | sign);
  }

  u.u &= 0xFFFFE000U;
  u.u -= 0x38000000U;

  return (u.u >> (24 - 11)) | sign;
}

void Image::Format::getChannelOrder(uint8_t* channelOrder) const {
  enum { CH_ORDER_R = 0, CH_ORDER_G, CH_ORDER_B, CH_ORDER_A };
  switch (image_channel_order) {
    case CL_A:
      channelOrder[0] = CH_ORDER_A;
      break;

    case CL_RA:
      channelOrder[0] = CH_ORDER_R;
      channelOrder[1] = CH_ORDER_A;
      break;

    case CL_BGRA:
      channelOrder[0] = CH_ORDER_B;
      channelOrder[1] = CH_ORDER_G;
      channelOrder[2] = CH_ORDER_R;
      channelOrder[3] = CH_ORDER_A;
      break;

    case CL_ARGB:
      channelOrder[0] = CH_ORDER_A;
      channelOrder[1] = CH_ORDER_R;
      channelOrder[2] = CH_ORDER_G;
      channelOrder[3] = CH_ORDER_B;
      break;

    default:
      channelOrder[0] = CH_ORDER_R;
      channelOrder[1] = CH_ORDER_G;
      channelOrder[2] = CH_ORDER_B;
      channelOrder[3] = CH_ORDER_A;
      break;
  }
}

// "colorRGBA" is a four component RGBA floating-point color value if the image
// channel data type is not an unnormalized signed and unsigned integer type,
// is a four component signed integer value if the image channel data type is
// an unnormalized signed integer type and is a four component unsigned integer
// value if the image channel data type is an unormalized unsigned integer type.
void Image::Format::formatColor(const void* colorRGBA, void* colorFormat) const {
  union t565 {
    struct {
      uint16_t r_ : 5;
      uint16_t g_ : 6;
      uint16_t b_ : 5;
    };
    uint16_t rgba_;
  };

  union t555 {
    struct {
      uint16_t r_ : 5;
      uint16_t g_ : 5;
      uint16_t b_ : 5;
      uint16_t a_ : 1;
    };
    uint16_t rgba_;
  };

  union t101010 {
    struct {
      uint32_t b_ : 10;
      uint32_t g_ : 10;
      uint32_t r_ : 10;
      uint32_t a_ : 2;
    };
    uint32_t rgba_;
  };

  const float* colorRGBAf = reinterpret_cast<const float*>(colorRGBA);
  const int32_t* colorRGBAi = reinterpret_cast<const int32_t*>(colorRGBA);
  const uint32_t* colorRGBAui = reinterpret_cast<const uint32_t*>(colorRGBA);

  size_t chCount = getNumChannels();
  uint8_t chOrder[4];
  getChannelOrder(chOrder);

  bool allChannels = false;
  for (size_t i = 0; i < chCount && !allChannels; ++i) {
    switch (image_channel_data_type) {
      case CL_SNORM_INT8: {
        int8_t* color = reinterpret_cast<int8_t*>(colorFormat);
        color[i] = round_to_even(INT8_MAX * colorRGBAf[chOrder[i]]);
      } break;
      case CL_SNORM_INT16: {
        int16_t* color = reinterpret_cast<int16_t*>(colorFormat);
        color[i] = round_to_even(INT16_MAX * colorRGBAf[chOrder[i]]);
      } break;
      case CL_UNORM_INT8: {
        uint8_t* color = reinterpret_cast<uint8_t*>(colorFormat);
        color[i] = round_to_even(UINT8_MAX * colorRGBAf[chOrder[i]]);
      } break;
      case CL_UNORM_INT16: {
        uint16_t* color = reinterpret_cast<uint16_t*>(colorFormat);
        color[i] = round_to_even(UINT16_MAX * colorRGBAf[chOrder[i]]);
      } break;
      case CL_UNORM_SHORT_565: {
        t565* color = reinterpret_cast<t565*>(colorFormat);
        color->r_ = round_to_even(0x1F * colorRGBAf[0]);
        color->g_ = round_to_even(0x3F * colorRGBAf[1]);
        color->b_ = round_to_even(0x1F * colorRGBAf[2]);
        allChannels = true;
      } break;
      case CL_UNORM_SHORT_555: {
        t555* color = reinterpret_cast<t555*>(colorFormat);
        color->r_ = round_to_even(0x1F * colorRGBAf[0]);
        color->g_ = round_to_even(0x1F * colorRGBAf[1]);
        color->b_ = round_to_even(0x1F * colorRGBAf[2]);
        color->a_ = round_to_even(colorRGBAf[3]);
        allChannels = true;
      } break;
      case CL_UNORM_INT_101010: {
        t101010* color = reinterpret_cast<t101010*>(colorFormat);
        color->r_ = round_to_even(0x3FF * colorRGBAf[0]);
        color->g_ = round_to_even(0x3FF * colorRGBAf[1]);
        color->b_ = round_to_even(0x3FF * colorRGBAf[2]);
        color->a_ = round_to_even(0x3 * colorRGBAf[3]);
        allChannels = true;
      } break;
      case CL_SIGNED_INT8: {
        int8_t* color = reinterpret_cast<int8_t*>(colorFormat);
        color[i] = colorRGBAi[chOrder[i]];
      } break;
      case CL_SIGNED_INT16: {
        int16_t* color = reinterpret_cast<int16_t*>(colorFormat);
        color[i] = colorRGBAi[chOrder[i]];
      } break;
      case CL_SIGNED_INT32: {
        int32_t* color = reinterpret_cast<int32_t*>(colorFormat);
        color[i] = colorRGBAi[chOrder[i]];
      } break;
      case CL_UNSIGNED_INT8: {
        uint8_t* color = reinterpret_cast<uint8_t*>(colorFormat);
        color[i] = colorRGBAui[chOrder[i]];
      } break;
      case CL_UNSIGNED_INT16: {
        uint16_t* color = reinterpret_cast<uint16_t*>(colorFormat);
        color[i] = colorRGBAui[chOrder[i]];
      } break;
      case CL_UNSIGNED_INT32: {
        uint32_t* color = reinterpret_cast<uint32_t*>(colorFormat);
        color[i] = colorRGBAui[chOrder[i]];
      } break;
      case CL_HALF_FLOAT: {
        uint16_t* color = reinterpret_cast<uint16_t*>(colorFormat);
        color[i] = float2half_rtz(colorRGBAf[chOrder[i]]);
      } break;
      case CL_FLOAT: {
        float* color = reinterpret_cast<float*>(colorFormat);
        color[i] = colorRGBAf[chOrder[i]];
      } break;
    }
  }
}

Monitor SvmBuffer::AllocatedLock_ ROCCLR_INIT_PRIORITY(101) ("Guards SVM allocation list");
std::map<uintptr_t, uintptr_t> SvmBuffer::Allocated_ ROCCLR_INIT_PRIORITY(101);

void SvmBuffer::Add(uintptr_t k, uintptr_t v) {
  ScopedLock lock(AllocatedLock_);
  Allocated_.insert(std::pair<uintptr_t, uintptr_t>(k, v));
}

void SvmBuffer::Remove(uintptr_t k) {
  ScopedLock lock(AllocatedLock_);
  Allocated_.erase(k);
}

bool SvmBuffer::Contains(uintptr_t ptr) {
  ScopedLock lock(AllocatedLock_);
  auto it = Allocated_.upper_bound(ptr);
  if (it == Allocated_.begin()) {
    return false;
  }
  --it;
  return ptr >= it->first && ptr < it->second;
}

// The allocation flags are ignored for now.
void* SvmBuffer::malloc(Context& context, cl_svm_mem_flags flags, size_t size, size_t alignment,
                        const amd::Device* curDev) {
  bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0;
  void* ret = context.svmAlloc(size, alignment, flags, curDev);
  if (ret == nullptr) {
    LogError("Unable to allocate aligned memory");
    return nullptr;
  }
  uintptr_t ret_u = reinterpret_cast<uintptr_t>(ret);
  Add(ret_u, ret_u + size);
  return ret;
}

void SvmBuffer::free(const Context& context, void* ptr) {
  Remove(reinterpret_cast<uintptr_t>(ptr));
  context.svmFree(ptr);
}

void SvmBuffer::memFill(void* dst, const void* src, size_t srcSize, size_t times) {
  address dstAddress = reinterpret_cast<address>(dst);
  const_address srcAddress = reinterpret_cast<const_address>(src);
  for (size_t i = 0; i < times; i++) {
    ::memcpy(dstAddress + i * srcSize, srcAddress, srcSize);
  }
}

// ================================================================================================
bool SvmBuffer::malloced(const void* ptr) { return Contains(reinterpret_cast<uintptr_t>(ptr)); }

// ================================================================================================
void IpcBuffer::initDeviceMemory() {
  deviceMemories_ =
      reinterpret_cast<DeviceMemory*>(reinterpret_cast<char*>(this) + sizeof(IpcBuffer));
  memset(deviceMemories_, 0, NumDevicesWithP2P() * sizeof(DeviceMemory));
}

}  // namespace amd