// // Copyright 2010 Advanced Micro Devices, Inc. All rights reserved. // #include "amdocl/cl_common.hpp" #include "os/alloc.hpp" #include "platform/context.hpp" #include "platform/object.hpp" #include "platform/memory.hpp" #include "device/device.hpp" namespace amd { bool BufferRect::create(const size_t* bufferOrigin, const size_t* region, size_t bufferRowPitch, size_t bufferSlicePitch) { bool valid = false; // Find the buffer's row pitch rowPitch_ = (bufferRowPitch != 0) ? bufferRowPitch : region[0]; // Find the buffer's slice pitch slicePitch_ = (bufferSlicePitch != 0) ? bufferSlicePitch : rowPitch_ * region[1]; // Find the region start offset start_ = bufferOrigin[2] * slicePitch_ + bufferOrigin[1] * rowPitch_ + bufferOrigin[0]; // Find the region relative end offset end_ = (region[2] - 1) * slicePitch_ + (region[1] - 1) * rowPitch_ + region[0]; // Make sure we have a valid region if ((rowPitch_ >= region[0]) && (slicePitch_ >= (region[1] * rowPitch_)) && ((slicePitch_ % rowPitch_) == 0)) { valid = true; } return valid; } bool HostMemoryReference::allocateMemory(size_t size, const Context& context) { assert(!alloced_ && "Runtime should not reallocate system memory!"); size_t memoryAlignment = (CPU_MEMORY_ALIGNMENT_SIZE <= 0) ? 256 : CPU_MEMORY_ALIGNMENT_SIZE; size_ = amd::alignUp(size, memoryAlignment); //! \note memory size must be aligned for CAL pinning hostMem_ = CPU_MEMORY_GUARD_PAGES ? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN, CPU_MEMORY_GUARD_PAGE_SIZE * Ki) : context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN); alloced_ = (hostMem_ != NULL); return alloced_; } // Frees system memory if it was allocated void HostMemoryReference::deallocateMemory(const Context& context) { if (alloced_) { if (CPU_MEMORY_GUARD_PAGES) GuardedMemory::deallocate(hostMem_); else context.hostFree(hostMem_); size_ = 0; alloced_ = false; hostMem_ = NULL; } } Memory::Memory(Context& context, Type type, Flags flags, size_t size, void* svmPtr) : numDevices_(0), deviceMemories_(NULL), destructorCallbacks_(NULL), context_(context), parent_(NULL), type_(type), hostMemRef_(NULL), origin_(0), size_(size), flags_(flags), version_(0), lastWriter_(NULL), interopObj_(NULL), isParent_(false), vDev_(NULL), forceSysMemAlloc_(false), mapCount_(0), svmHostAddress_(svmPtr), svmPtrCommited_(flags & CL_MEM_SVM_FINE_GRAIN_BUFFER ? true : false), canBeCached_(true), lockMemoryOps_("Memory Ops Lock", true) {} Memory::Memory(Memory& parent, Flags flags, size_t origin, size_t size, Type type) : numDevices_(0), deviceMemories_(NULL), destructorCallbacks_(NULL), context_(parent.getContext()), parent_(&parent), type_((type == 0) ? parent.type_ : type), hostMemRef_(NULL), origin_(origin), size_(size), flags_(flags), version_(parent.getVersion()), lastWriter_(parent.getLastWriter()), interopObj_(parent.getInteropObj()), isParent_(false), vDev_(NULL), forceSysMemAlloc_(false), mapCount_(0), svmHostAddress_(parent.getSvmPtr()), svmPtrCommited_(parent.isSvmPtrCommited()), canBeCached_(true), lockMemoryOps_("Memory Ops Lock", true) { parent_->retain(); parent_->isParent_ = true; if (parent.getHostMem() != nullptr) { setHostMem(reinterpret_cast
(parent.getHostMem()) + origin); } // Inherit memory flags from the parent if ((flags_ & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY)) == 0) { flags_ |= parent_->getMemFlags() & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY); } flags_ |= parent_->getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR); if ((flags_ & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) { flags_ |= parent_->getMemFlags() & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS); } } void Memory::initDeviceMemory() { deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Memory)); memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); } void* Memory::operator new(size_t size, const Context& context) { return RuntimeObject::operator new(size + context.devices().size() * sizeof(DeviceMemory)); } void Memory::operator delete(void* p) { RuntimeObject::operator delete(p); } void Memory::operator delete(void* p, const Context& context) { Memory::operator delete(p); } void Memory::addSubBuffer(Memory* view) { amd::ScopedLock lock(lockMemoryOps()); subBuffers_.push_back(view); } void Memory::removeSubBuffer(Memory* view) { amd::ScopedLock lock(lockMemoryOps()); subBuffers_.remove(view); } bool Memory::allocHostMemory(void* initFrom, bool allocHostMem, bool forceCopy) { // Sanity checks (the parameters should have been prevalidated by the API) assert(!(flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR) && (initFrom == NULL) && !allocHostMem && !isSvmPtrCommited())); assert( !((initFrom != NULL) && !forceCopy && !(flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD)))); assert(!(flags_ & CL_MEM_COPY_HOST_PTR && flags_ & CL_MEM_USE_HOST_PTR)); const std::vector& devices = context_().devices(); // This allocation is necessary to use coherency mechanism // for the initialization if (getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { allocHostMem = true; } // Did application request to use host memory? if (getMemFlags() & CL_MEM_USE_HOST_PTR) { setHostMem(initFrom); // Recalculate image size according to pitch Image* image = asImage(); if (image != NULL) { if (image->getDims() < 3) { size_ = image->getRowPitch() * image->getHeight(); } else { size_ = image->getSlicePitch() * image->getDepth(); } } } // Allocate host memory buffer if needed else if (allocHostMem && !isInterop()) { if (!hostMemRef_.allocateMemory(size_, context_())) { return false; } // Copy data to the backing store if the app has requested if (((flags_ & CL_MEM_COPY_HOST_PTR) || forceCopy) && (initFrom != NULL)) { copyToBackingStore(initFrom); } } if (allocHostMem && type_ == CL_MEM_OBJECT_PIPE) { // Initialize the pipe for a CPU device clk_pipe_t* pipe = reinterpret_cast(getHostMem()); pipe->read_idx = 0; pipe->write_idx = 0; pipe->end_idx = asPipe()->getMaxNumPackets(); } if ((flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) && (NULL == lastWriter_)) { // Signal write, so coherency mechanism will initialize // memory on all devices signalWrite(NULL); } return true; } bool Memory::create(void* initFrom, bool sysMemAlloc, bool skipAlloc) { static const bool forceAllocHostMem = false; initDeviceMemory(); // Check if it's a subbuffer allocation if (parent_ != NULL) { // Find host memory pointer for subbuffer if (parent_->getHostMem() != NULL) { setHostMem((address)parent_->getHostMem() + origin_); } // Add a new subbuffer to the list parent_->addSubBuffer(this); } // Allocate host memory if requested else if (!allocHostMemory(initFrom, forceAllocHostMem)) { return false; } const std::vector& devices = context_().devices(); // Forces system memory allocation on the device, // instead of device memory forceSysMemAlloc_ = sysMemAlloc; // Create memory on all available devices for (size_t i = 0; i < devices.size(); i++) { deviceAlloced_[devices[i]] = AllocInit; deviceMemories_[i].ref_ = devices[i]; deviceMemories_[i].value_ = NULL; if (!skipAlloc && ((devices.size() == 1) || DISABLE_DEFERRED_ALLOC)) { device::Memory* mem = getDeviceMemory(*devices[i]); if (NULL == mem) { LogPrintfError("Can't allocate memory size - 0x%08X bytes!", getSize()); return false; } } } return true; } bool Memory::addDeviceMemory(const Device* dev) { bool result = false; AllocState create = AllocCreate; AllocState init = AllocInit; if (make_atomic(deviceAlloced_[dev]).compareAndSet(init, create)) { device::Memory* dm = dev->createMemory(*this); // Add the new memory allocation to the device map if (NULL != dm) { deviceMemories_[numDevices_].ref_ = dev; deviceMemories_[numDevices_].value_ = dm; numDevices_++; assert((numDevices() <= context_().devices().size()) && "Too many device objects"); // Mark the allocation with the complete flag deviceAlloced_[dev] = AllocComplete; if (getSvmPtr() != nullptr) { svmBase_ = dm; } } else { // Mark the allocation as an empty deviceAlloced_[dev] = AllocInit; } } // Make sure runtime finished memory allocation. // Loop if in the create state while (deviceAlloced_[dev] == AllocCreate) { Os::yield(); } if (deviceAlloced_[dev] == AllocComplete) { result = true; } return result; } void Memory::replaceDeviceMemory(const Device* dev, device::Memory* dm) { uint i; for (i = 0; i < numDevices_; ++i) { if (deviceMemories_[i].ref_ == dev) { delete deviceMemories_[i].value_; break; } } if (numDevices_ == 0) { ++numDevices_; deviceMemories_[0].ref_ = dev; } deviceMemories_[i].value_ = dm; deviceAlloced_[dev] = AllocRealloced; } device::Memory* Memory::getDeviceMemory(const Device& dev, bool alloc) { device::Memory* dm = NULL; for (uint i = 0; i < numDevices_; ++i) { if (deviceMemories_[i].ref_ == &dev) { dm = deviceMemories_[i].value_; break; } } if ((NULL == dm) && alloc) { if (!addDeviceMemory(&dev)) { LogError("Video memory allocation failed!"); return NULL; } dm = deviceMemories_[numDevices() - 1].value_; } return dm; } Memory::~Memory() { // For_each destructor callback: DestructorCallBackEntry* entry; for (entry = destructorCallbacks_; entry != NULL; entry = entry->next_) { // invoke the callback function. entry->callback_(const_cast(as_cl(this)), entry->data_); } // Release the parent. if (NULL != parent_) { // Update cache if runtime destroys a subbuffer if (NULL != parent_->getHostMem() && (vDev_ == NULL)) { cacheWriteBack(); } parent_->removeSubBuffer(this); } if (NULL != deviceMemories_) { // Destroy all device memory objects for (uint i = 0; i < numDevices_; ++i) { delete deviceMemories_[i].value_; } } // Sanity check if (subBuffers_.size() != 0) { LogError("Can't have views if parent is destroyed!"); } // Destroy the destructor callback entries DestructorCallBackEntry* callback = destructorCallbacks_; while (callback != NULL) { DestructorCallBackEntry* next = callback->next_; delete callback; callback = next; } // Make sure runtime destroys the parent only after subbuffer destruction if (NULL != parent_) { parent_->release(); } hostMemRef_.deallocateMemory(context_()); } bool Memory::setDestructorCallback(DestructorCallBackFunction callback, void* data) { DestructorCallBackEntry* entry = new DestructorCallBackEntry(callback, data); if (entry == NULL) { return false; } entry->next_ = destructorCallbacks_; while (!destructorCallbacks_.compare_exchange_weak(entry->next_, entry)) ; // Someone else is also updating the head of the linked list! reload. return true; } void Memory::signalWrite(const Device* writer) { // (the potential race condition below doesn't matter, no critical // section needed) ++version_; lastWriter_ = writer; // Update all subbuffers for this object for (auto buf : subBuffers_) { buf->signalWrite(writer); } } void Memory::cacheWriteBack() { if (NULL != lastWriter_) { device::Memory* dmem = getDeviceMemory(*lastWriter_); //! @note It's a special condition, when a subbuffer was created, //! but never used. Thus dev memory is still NULL and lastWriter_ //! was passed from the parent. if (NULL != dmem) { dmem->syncHostFromCache(); } } else if (isParent()) { // On CPU parent can't be synchronized, because lastWriter_ could be NULL // and syncHostFromCache() won't be called. for (uint i = 0; i < numDevices_; ++i) { deviceMemories_[i].value_->syncHostFromCache(); } } } void Memory::copyToBackingStore(void* initFrom) { memcpy(getHostMem(), initFrom, size_); } bool Memory::usesSvmPointer() const { if (!(flags_ & CL_MEM_USE_HOST_PTR)) { return false; } // If the application host pointer lies within a SVM region, so does the // sub-buffer host pointer - so the following check works in both cases return (SvmBuffer::malloced(getHostMem()) || NULL != svmHostAddress_); } void Memory::commitSvmMemory() { ScopedLock lock(lockMemoryOps_); if (!svmPtrCommited_) { amd::Os::commitMemory(svmHostAddress_, size_, amd::Os::MEM_PROT_RW); svmPtrCommited_ = true; } } void Memory::uncommitSvmMemory() { ScopedLock lock(lockMemoryOps_); if (svmPtrCommited_ && !(flags_ & CL_MEM_SVM_FINE_GRAIN_BUFFER)) { amd::Os::uncommitMemory(svmHostAddress_, size_); svmPtrCommited_ = false; } } void Buffer::initDeviceMemory() { deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Buffer)); memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); } bool Buffer::create(void* initFrom, bool sysMemAlloc, bool skipAlloc) { if ((getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) && (initFrom != NULL)) { busAddress_ = *(reinterpret_cast(initFrom)); initFrom = NULL; } else { busAddress_.surface_bus_address = 0; busAddress_.marker_bus_address = 0; } return Memory::create(initFrom, sysMemAlloc, skipAlloc); } bool Buffer::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const { return ((origin[0] == 0) && (region[0] == getSize())) ? true : false; } bool Buffer::validateRegion(const Coord3D& origin, const Coord3D& region) const { return ((region[0] > 0) && (origin[0] < getSize()) && ((origin[0] + region[0]) <= getSize())) ? true : false; } void Pipe::initDeviceMemory() { deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Pipe)); memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); } #define GETMIPDIM(dim, mip) (((dim >> mip) > 0) ? (dim >> mip) : 1) Image::Image(const Format& format, Image& parent, uint baseMipLevel, cl_mem_flags flags) : Memory(parent, flags, 0, parent.getWidth() * parent.getHeight() * parent.getDepth() * format.getElementSize()), impl_(format, Coord3D(parent.getWidth() * parent.getImageFormat().getElementSize() / format.getElementSize(), parent.getHeight(), parent.getDepth()), parent.getRowPitch(), parent.getSlicePitch(), parent.getBytePitch()), mipLevels_(1), baseMipLevel_(baseMipLevel) { if (baseMipLevel > 0) { impl_.region_.c[0] = GETMIPDIM(parent.getWidth(), baseMipLevel) * parent.getImageFormat().getElementSize() / format.getElementSize(); impl_.region_.c[1] = GETMIPDIM(parent.getHeight(), baseMipLevel); impl_.region_.c[2] = GETMIPDIM(parent.getDepth(), baseMipLevel); if (parent.getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { impl_.region_.c[1] = parent.getHeight(); } else if (parent.getType() == CL_MEM_OBJECT_IMAGE2D_ARRAY) { impl_.region_.c[2] = parent.getDepth(); } size_ = getWidth() * getHeight() * parent.getDepth() * format.getElementSize(); } initDimension(); } Image::Image(Context& context, Type type, Flags flags, const Format& format, size_t width, size_t height, size_t depth, size_t rowPitch, size_t slicePitch, uint mipLevels) : Memory(context, type, flags, width * height * depth * format.getElementSize()), impl_(format, Coord3D(width, height, depth), rowPitch, slicePitch), mipLevels_(mipLevels), baseMipLevel_(0) { initDimension(); } Image::Image(Buffer& buffer, Type type, Flags flags, const Format& format, size_t width, size_t height, size_t depth, size_t rowPitch, size_t slicePitch) : Memory(buffer, flags, 0, buffer.getSize(), type), impl_(format, Coord3D(width, height, depth), rowPitch, slicePitch), mipLevels_(1), baseMipLevel_(0) { initDimension(); } bool Image::validateDimensions(const std::vector& devices, cl_mem_object_type type, size_t width, size_t height, size_t depth, size_t arraySize) { bool sizePass = false; switch (type) { case CL_MEM_OBJECT_IMAGE3D: if ((width == 0) || (height == 0) || (depth < 1)) { return false; } for (const auto& dev : devices) { if ((dev->info().image3DMaxWidth_ >= width) && (dev->info().image3DMaxHeight_ >= height) && (dev->info().image3DMaxDepth_ >= depth)) { return true; } } break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: if (arraySize == 0) { return false; } for (const auto& dev : devices) { if (dev->info().imageMaxArraySize_ >= arraySize) { sizePass = true; break; } } if (!sizePass) { return false; } // Fall through... case CL_MEM_OBJECT_IMAGE2D: if ((width == 0) || (height == 0)) { return false; } for (const auto dev : devices) { if ((dev->info().image2DMaxHeight_ >= height) && (dev->info().image2DMaxWidth_ >= width)) { return true; } } break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: if (arraySize == 0) { return false; } for (const auto& dev : devices) { if (dev->info().imageMaxArraySize_ >= arraySize) { sizePass = true; break; } } if (!sizePass) { return false; } // Fall through... case CL_MEM_OBJECT_IMAGE1D: if (width == 0) { return false; } for (const auto& dev : devices) { if (dev->info().image2DMaxWidth_ >= width) { return true; } } break; case CL_MEM_OBJECT_IMAGE1D_BUFFER: if (width == 0) { return false; } for (const auto& dev : devices) { if (dev->info().imageMaxBufferSize_ >= width) { return true; } } break; default: break; } return false; } void Image::initDimension() { const size_t elemSize = impl_.format_.getElementSize(); if (impl_.rp_ == 0) { impl_.rp_ = impl_.region_[0] * elemSize; } switch (type_) { case CL_MEM_OBJECT_IMAGE3D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: dim_ = 3; if (impl_.sp_ == 0) { impl_.sp_ = impl_.region_[0] * impl_.region_[1] * elemSize; } break; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE1D_ARRAY: dim_ = 2; if ((impl_.sp_ == 0) && (type_ == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { impl_.sp_ = impl_.rp_; } break; case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: default: dim_ = 1; break; } } void Image::initDeviceMemory() { deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Image)); memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); } bool Image::create(void* initFrom) { return Memory::create(initFrom); } size_t Image::Format::getNumChannels() const { switch (image_channel_order) { case CL_RG: case CL_RA: return 2; case CL_RGB: case CL_sRGB: case CL_sRGBx: return 3; case CL_RGBA: case CL_BGRA: case CL_ARGB: case CL_sRGBA: case CL_sBGRA: return 4; } return 1; } size_t Image::Format::getElementSize() const { size_t bytesPerPixel = getNumChannels(); switch (image_channel_data_type) { case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SIGNED_INT8: case CL_UNSIGNED_INT8: break; case CL_UNORM_INT_101010: bytesPerPixel = 4; break; case CL_SIGNED_INT32: case CL_UNSIGNED_INT32: case CL_FLOAT: bytesPerPixel *= 4; break; default: bytesPerPixel *= 2; break; } return bytesPerPixel; } bool Image::Format::isValid() const { switch (image_channel_data_type) { case CL_SNORM_INT8: case CL_SNORM_INT16: case CL_UNORM_INT8: case CL_UNORM_INT16: case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: case CL_UNORM_INT_101010: case CL_SIGNED_INT8: case CL_SIGNED_INT16: case CL_SIGNED_INT32: case CL_UNSIGNED_INT8: case CL_UNSIGNED_INT16: case CL_UNSIGNED_INT32: case CL_HALF_FLOAT: case CL_FLOAT: break; default: return false; } switch (image_channel_order) { case CL_R: case CL_A: case CL_RG: case CL_RA: case CL_RGBA: break; case CL_INTENSITY: case CL_LUMINANCE: switch (image_channel_data_type) { case CL_SNORM_INT8: case CL_SNORM_INT16: case CL_UNORM_INT8: case CL_UNORM_INT16: case CL_HALF_FLOAT: case CL_FLOAT: break; default: return false; } break; case CL_RGB: switch (image_channel_data_type) { case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: case CL_UNORM_INT_101010: break; default: return false; } break; case CL_BGRA: case CL_ARGB: switch (image_channel_data_type) { case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SIGNED_INT8: case CL_UNSIGNED_INT8: break; default: return false; } break; case CL_sRGB: case CL_sRGBx: case CL_sRGBA: case CL_sBGRA: switch (image_channel_data_type) { case CL_UNORM_INT8: break; default: return false; } break; case CL_DEPTH: switch (image_channel_data_type) { case CL_UNORM_INT16: case CL_FLOAT: break; default: return false; } break; default: return false; } return true; } // definition of list of supported formats cl_image_format Image::supportedFormats[] = { // R {CL_R, CL_SNORM_INT8}, {CL_R, CL_SNORM_INT16}, {CL_R, CL_UNORM_INT8}, {CL_R, CL_UNORM_INT16}, {CL_R, CL_SIGNED_INT8}, {CL_R, CL_SIGNED_INT16}, {CL_R, CL_SIGNED_INT32}, {CL_R, CL_UNSIGNED_INT8}, {CL_R, CL_UNSIGNED_INT16}, {CL_R, CL_UNSIGNED_INT32}, {CL_R, CL_HALF_FLOAT}, {CL_R, CL_FLOAT}, // A {CL_A, CL_SNORM_INT8}, {CL_A, CL_SNORM_INT16}, {CL_A, CL_UNORM_INT8}, {CL_A, CL_UNORM_INT16}, {CL_A, CL_SIGNED_INT8}, {CL_A, CL_SIGNED_INT16}, {CL_A, CL_SIGNED_INT32}, {CL_A, CL_UNSIGNED_INT8}, {CL_A, CL_UNSIGNED_INT16}, {CL_A, CL_UNSIGNED_INT32}, {CL_A, CL_HALF_FLOAT}, {CL_A, CL_FLOAT}, // RG {CL_RG, CL_SNORM_INT8}, {CL_RG, CL_SNORM_INT16}, {CL_RG, CL_UNORM_INT8}, {CL_RG, CL_UNORM_INT16}, {CL_RG, CL_SIGNED_INT8}, {CL_RG, CL_SIGNED_INT16}, {CL_RG, CL_SIGNED_INT32}, {CL_RG, CL_UNSIGNED_INT8}, {CL_RG, CL_UNSIGNED_INT16}, {CL_RG, CL_UNSIGNED_INT32}, {CL_RG, CL_HALF_FLOAT}, {CL_RG, CL_FLOAT}, // RGBA {CL_RGBA, CL_SNORM_INT8}, {CL_RGBA, CL_SNORM_INT16}, {CL_RGBA, CL_UNORM_INT8}, {CL_RGBA, CL_UNORM_INT16}, {CL_RGBA, CL_SIGNED_INT8}, {CL_RGBA, CL_SIGNED_INT16}, {CL_RGBA, CL_SIGNED_INT32}, {CL_RGBA, CL_UNSIGNED_INT8}, {CL_RGBA, CL_UNSIGNED_INT16}, {CL_RGBA, CL_UNSIGNED_INT32}, {CL_RGBA, CL_HALF_FLOAT}, {CL_RGBA, CL_FLOAT}, // ARGB {CL_ARGB, CL_SNORM_INT8}, {CL_ARGB, CL_UNORM_INT8}, {CL_ARGB, CL_SIGNED_INT8}, {CL_ARGB, CL_UNSIGNED_INT8}, // BGRA {CL_BGRA, CL_SNORM_INT8}, {CL_BGRA, CL_UNORM_INT8}, {CL_BGRA, CL_SIGNED_INT8}, {CL_BGRA, CL_UNSIGNED_INT8}, // LUMINANCE {CL_LUMINANCE, CL_SNORM_INT8}, {CL_LUMINANCE, CL_SNORM_INT16}, {CL_LUMINANCE, CL_UNORM_INT8}, {CL_LUMINANCE, CL_UNORM_INT16}, {CL_LUMINANCE, CL_HALF_FLOAT}, {CL_LUMINANCE, CL_FLOAT}, // INTENSITY {CL_INTENSITY, CL_SNORM_INT8}, {CL_INTENSITY, CL_SNORM_INT16}, {CL_INTENSITY, CL_UNORM_INT8}, {CL_INTENSITY, CL_UNORM_INT16}, {CL_INTENSITY, CL_HALF_FLOAT}, {CL_INTENSITY, CL_FLOAT}, // RGB {CL_RGB, CL_UNORM_INT_101010}, // sRGB {CL_sRGBA, CL_UNORM_INT8}, // DEPTH {CL_DEPTH, CL_UNORM_INT16}, {CL_DEPTH, CL_FLOAT}, }; const cl_uint NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of // the table supportedFormats above and before sRGB and // depth. const cl_uint NUM_CHANNEL_ORDER_OF_sRGB = 1; // The number of channel orders of sRGB at the end of // the table supportedFormats above and before depth. const cl_uint NUM_CHANNEL_ORDER_OF_DEPTH = 2; // The number of channel orders of DEPTH at the end of the table supportedFormats above. // definition of list of supported RA formats cl_image_format Image::supportedFormatsRA[] = { {CL_RA, CL_SNORM_INT8}, {CL_RA, CL_SNORM_INT16}, {CL_RA, CL_UNORM_INT8}, {CL_RA, CL_UNORM_INT16}, {CL_RA, CL_SIGNED_INT8}, {CL_RA, CL_SIGNED_INT16}, {CL_RA, CL_SIGNED_INT32}, {CL_RA, CL_UNSIGNED_INT8}, {CL_RA, CL_UNSIGNED_INT16}, {CL_RA, CL_UNSIGNED_INT32}, {CL_RA, CL_HALF_FLOAT}, {CL_RA, CL_FLOAT}, }; cl_image_format Image::supportedDepthStencilFormats[] = { // DEPTH STENCIL {CL_DEPTH_STENCIL, CL_FLOAT}, {CL_DEPTH_STENCIL, CL_UNORM_INT24}}; cl_uint Image::numSupportedFormats(const Context& context, cl_mem_object_type image_type, cl_mem_flags flags) { const std::vector& devices = context.devices(); uint numFormats = sizeof(supportedFormats) / sizeof(cl_image_format); bool supportRA = false; bool supportDepthsRGB = false; bool supportDepthStencil = false; // Add RA if RA is supported. for (uint i = 0; i < devices.size(); i++) { if (devices[i]->settings().supportRA_) { supportRA = true; } if (devices[i]->settings().supportDepthsRGB_) { supportDepthsRGB = true; } if (devices[i]->settings().checkExtension(ClKhrGLDepthImages) && (context.info().flags_ & Context::GLDeviceKhr)) { supportDepthStencil = true; } } if (supportDepthsRGB) { if ((image_type != CL_MEM_OBJECT_IMAGE2D) && (image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) && (image_type != 0)) { numFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. } // Currently we are not supported sRGB for write_imagef (extension cl_khr_srgb_image_writes) if ((image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) || ((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) != 0)) { numFormats -= NUM_CHANNEL_ORDER_OF_sRGB; } } else { numFormats -= NUM_CHANNEL_ORDER_OF_RGB; // substract channel order of RGB type. numFormats -= NUM_CHANNEL_ORDER_OF_sRGB; // substract channel order of sRGB type. numFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. } // Add RA if RA is supported. RA isn't supported on SI. if (supportRA) { numFormats += sizeof(supportedFormatsRA) / sizeof(cl_image_format); // Add channel order of RA type. } if (supportDepthStencil) { if (flags & CL_MEM_READ_ONLY) { numFormats += sizeof(supportedDepthStencilFormats) / sizeof(cl_image_format); } } return numFormats; } cl_uint Image::getSupportedFormats(const Context& context, cl_mem_object_type image_type, const cl_uint num_entries, cl_image_format* image_formats, cl_mem_flags flags) { const std::vector& devices = context.devices(); uint numFormats = 0; bool supportRA = false; bool supportDepthsRGB = false; bool supportDepthStencil = false; // Add RA if RA is supported. for (uint i = 0; i < devices.size(); i++) { if (devices[i]->settings().supportRA_) { supportRA = true; } if (devices[i]->settings().supportDepthsRGB_) { supportDepthsRGB = true; } if (devices[i]->settings().checkExtension(ClKhrGLDepthImages) && (context.info().flags_ & Context::GLDeviceKhr)) { supportDepthStencil = true; } } cl_image_format* format = image_formats; uint numSupportedFormats = sizeof(supportedFormats) / sizeof(cl_image_format); bool srgbWriteSupported = true; if (supportDepthsRGB) { if ((image_type != CL_MEM_OBJECT_IMAGE2D) && (image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) && (image_type != 0)) { numSupportedFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; } // Currently we are not supported sRGB for write_imagef (extension cl_khr_srgb_image_writes) if ((image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) || ((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) != 0)) { srgbWriteSupported = false; } } else { numSupportedFormats -= NUM_CHANNEL_ORDER_OF_RGB; // substract channel order of RGB type. numSupportedFormats -= NUM_CHANNEL_ORDER_OF_sRGB; // substract channel order of sRGB type. numSupportedFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. } for (uint i = 0; i < numSupportedFormats; i++) { if (numFormats == num_entries) { break; } if (!srgbWriteSupported) { if ((amd::Image::supportedFormats[i].image_channel_order == CL_sRGBA) || (amd::Image::supportedFormats[i].image_channel_order == CL_sRGB) || (amd::Image::supportedFormats[i].image_channel_order == CL_sRGBx) || (amd::Image::supportedFormats[i].image_channel_order == CL_sBGRA)) { continue; } } *format++ = amd::Image::supportedFormats[i]; numFormats++; } // Add RA if RA is supported. if (supportRA) { for (uint i = 0; i < sizeof(supportedFormatsRA) / sizeof(cl_image_format); i++) { if (numFormats == num_entries) { break; } *format++ = amd::Image::supportedFormatsRA[i]; numFormats++; } } if (supportDepthStencil) { if (flags & CL_MEM_READ_ONLY) { for (uint i = 0; i < sizeof(supportedDepthStencilFormats) / sizeof(cl_image_format); i++) { if (numFormats == num_entries) { break; } *format++ = amd::Image::supportedDepthStencilFormats[i]; numFormats++; } } } return numFormats; } bool Image::Format::isSupported(const Context& context, cl_mem_object_type image_type, cl_mem_flags flags) const { const cl_image_format RGBA10 = {CL_RGBA, CL_UNORM_INT_101010}; uint numFormats = numSupportedFormats(context, image_type, flags); std::vector image_formats(numFormats); getSupportedFormats(context, image_type, numFormats, image_formats.data(), flags); for (uint i = 0; i < numFormats; i++) { if (*this == image_formats[i]) { return true; } } if (*this == RGBA10) { return true; } return false; } Image* Image::createView(const Context& context, const Format& format, device::VirtualDevice* vDev, uint baseMipLevel, cl_mem_flags flags) { Image* view = NULL; // Find the image dimensions and create a corresponding object view = new (context) Image(format, *this, baseMipLevel, flags); // Set GPU virtual device for this view view->setVirtualDevice(vDev); if (view != NULL) { // Initialize view view->initDeviceMemory(); } return view; } bool Image::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const { return (origin[0] == 0 && origin[1] == 0 && origin[2] == 0 && region[0] == getWidth() && region[1] == getHeight() && region[2] == getDepth()) ? true : false; } bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const { return ((region[0] > 0) && (region[1] > 0) && (region[2] > 0) && (origin[0] < getWidth()) && (region[0] != 0) && (origin[1] < getHeight()) && (region[1] != 0) && (origin[2] < getDepth()) && (region[2] != 0) && ((origin[0] + region[0]) <= getWidth()) && ((origin[1] + region[1]) <= getHeight()) && ((origin[2] + region[2]) <= getDepth())) ? true : false; } bool Image::isRowSliceValid(size_t rowPitch, size_t slice, size_t width, size_t height) const { size_t tmpHeight = (getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 1 : height; bool valid = (rowPitch == 0) || ((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize())); return ((slice == 0) || ((slice != 0) && (slice >= rowPitch * tmpHeight))) ? valid : false; } void Image::copyToBackingStore(void* initFrom) { char* src; char* dst = reinterpret_cast(getHostMem()); size_t cpySize = getWidth() * getImageFormat().getElementSize(); for (uint z = 0; z < getDepth(); ++z) { src = reinterpret_cast(initFrom) + z * getSlicePitch(); for (uint y = 0; y < getHeight(); ++y) { memcpy(dst, src, cpySize); dst += cpySize; src += getRowPitch(); } } impl_.rp_ = cpySize; if (impl_.sp_ != 0) { impl_.sp_ = impl_.rp_; if (getDims() == 3) { impl_.sp_ *= getHeight(); } } } static int round_to_even(float v) { // clamp overflow if (v >= -(float)std::numeric_limits::min()) { return std::numeric_limits::max(); } if (v <= (float)std::numeric_limits::min()) { return std::numeric_limits::min(); } static const unsigned int magic[2] = {0x4b000000u, 0xcb000000u}; // round fractional values to integer value if (fabsf(v) < *reinterpret_cast(&magic[0])) { float magicVal = *reinterpret_cast(&magic[v < 0.0f]); v += magicVal; v -= magicVal; } return static_cast(v); } static uint16_t float2half_rtz(float f) { union { float f; cl_uint u; } u = {f}; cl_uint sign = (u.u >> 16) & 0x8000; float x = fabsf(f); // Nan if (x != x) { u.u >>= (24 - 11); u.u &= 0x7fff; u.u |= 0x0200; // silence the NaN return u.u | sign; } int values[5] = {0x47800000, 0x33800000, 0x38800000, 0x4b800000, 0x7f800000}; // overflow if (x >= *reinterpret_cast(&values[0])) { if (x == *reinterpret_cast(&values[4])) { return 0x7c00 | sign; } return 0x7bff | sign; } // underflow if (x < *reinterpret_cast(&values[1])) { return sign; // The halfway case can return 0x0001 or 0. 0 is even. } // half denormal if (x < *reinterpret_cast(&values[2])) { x *= *reinterpret_cast(&values[3]); return static_cast((int)x | sign); } u.u &= 0xFFFFE000U; u.u -= 0x38000000U; return (u.u >> (24 - 11)) | sign; } void Image::Format::getChannelOrder(uint8_t* channelOrder) const { enum { CH_ORDER_R = 0, CH_ORDER_G, CH_ORDER_B, CH_ORDER_A }; switch (image_channel_order) { case CL_A: channelOrder[0] = CH_ORDER_A; break; case CL_RA: channelOrder[0] = CH_ORDER_R; channelOrder[1] = CH_ORDER_A; break; case CL_BGRA: channelOrder[0] = CH_ORDER_B; channelOrder[1] = CH_ORDER_G; channelOrder[2] = CH_ORDER_R; channelOrder[3] = CH_ORDER_A; break; case CL_ARGB: channelOrder[0] = CH_ORDER_A; channelOrder[1] = CH_ORDER_R; channelOrder[2] = CH_ORDER_G; channelOrder[3] = CH_ORDER_B; break; default: channelOrder[0] = CH_ORDER_R; channelOrder[1] = CH_ORDER_G; channelOrder[2] = CH_ORDER_B; channelOrder[3] = CH_ORDER_A; break; } } // "colorRGBA" is a four component RGBA floating-point color value if the image // channel data type is not an unnormalized signed and unsigned integer type, // is a four component signed integer value if the image channel data type is // an unnormalized signed integer type and is a four component unsigned integer // value if the image channel data type is an unormalized unsigned integer type. void Image::Format::formatColor(const void* colorRGBA, void* colorFormat) const { union t565 { struct { uint16_t r_ : 5; uint16_t g_ : 6; uint16_t b_ : 5; }; uint16_t rgba_; }; union t555 { struct { uint16_t r_ : 5; uint16_t g_ : 5; uint16_t b_ : 5; uint16_t a_ : 1; }; uint16_t rgba_; }; union t101010 { struct { uint32_t b_ : 10; uint32_t g_ : 10; uint32_t r_ : 10; uint32_t a_ : 2; }; uint32_t rgba_; }; const float* colorRGBAf = reinterpret_cast(colorRGBA); const int32_t* colorRGBAi = reinterpret_cast(colorRGBA); const uint32_t* colorRGBAui = reinterpret_cast(colorRGBA); size_t chCount = getNumChannels(); uint8_t chOrder[4]; getChannelOrder(chOrder); bool allChannels = false; for (size_t i = 0; i < chCount && !allChannels; ++i) { switch (image_channel_data_type) { case CL_SNORM_INT8: { int8_t* color = reinterpret_cast(colorFormat); color[i] = round_to_even(INT8_MAX * colorRGBAf[chOrder[i]]); } break; case CL_SNORM_INT16: { int16_t* color = reinterpret_cast(colorFormat); color[i] = round_to_even(INT16_MAX * colorRGBAf[chOrder[i]]); } break; case CL_UNORM_INT8: { uint8_t* color = reinterpret_cast(colorFormat); color[i] = round_to_even(UINT8_MAX * colorRGBAf[chOrder[i]]); } break; case CL_UNORM_INT16: { uint16_t* color = reinterpret_cast(colorFormat); color[i] = round_to_even(UINT16_MAX * colorRGBAf[chOrder[i]]); } break; case CL_UNORM_SHORT_565: { t565* color = reinterpret_cast(colorFormat); color->r_ = round_to_even(0x1F * colorRGBAf[0]); color->g_ = round_to_even(0x3F * colorRGBAf[1]); color->b_ = round_to_even(0x1F * colorRGBAf[2]); allChannels = true; } break; case CL_UNORM_SHORT_555: { t555* color = reinterpret_cast(colorFormat); color->r_ = round_to_even(0x1F * colorRGBAf[0]); color->g_ = round_to_even(0x1F * colorRGBAf[1]); color->b_ = round_to_even(0x1F * colorRGBAf[2]); color->a_ = round_to_even(colorRGBAf[3]); allChannels = true; } break; case CL_UNORM_INT_101010: { t101010* color = reinterpret_cast(colorFormat); color->r_ = round_to_even(0x3FF * colorRGBAf[0]); color->g_ = round_to_even(0x3FF * colorRGBAf[1]); color->b_ = round_to_even(0x3FF * colorRGBAf[2]); color->a_ = round_to_even(0x3 * colorRGBAf[3]); allChannels = true; } break; case CL_SIGNED_INT8: { int8_t* color = reinterpret_cast(colorFormat); color[i] = colorRGBAi[chOrder[i]]; } break; case CL_SIGNED_INT16: { int16_t* color = reinterpret_cast(colorFormat); color[i] = colorRGBAi[chOrder[i]]; } break; case CL_SIGNED_INT32: { int32_t* color = reinterpret_cast(colorFormat); color[i] = colorRGBAi[chOrder[i]]; } break; case CL_UNSIGNED_INT8: { uint8_t* color = reinterpret_cast(colorFormat); color[i] = colorRGBAui[chOrder[i]]; } break; case CL_UNSIGNED_INT16: { uint16_t* color = reinterpret_cast(colorFormat); color[i] = colorRGBAui[chOrder[i]]; } break; case CL_UNSIGNED_INT32: { uint32_t* color = reinterpret_cast(colorFormat); color[i] = colorRGBAui[chOrder[i]]; } break; case CL_HALF_FLOAT: { uint16_t* color = reinterpret_cast(colorFormat); color[i] = float2half_rtz(colorRGBAf[chOrder[i]]); } break; case CL_FLOAT: { float* color = reinterpret_cast(colorFormat); color[i] = colorRGBAf[chOrder[i]]; } break; } } } std::map SvmBuffer::Allocated_; Monitor SvmBuffer::AllocatedLock_("Guards SVM allocation list"); void SvmBuffer::Add(uintptr_t k, uintptr_t v) { ScopedLock lock(AllocatedLock_); Allocated_.insert(std::pair(k, v)); } void SvmBuffer::Remove(uintptr_t k) { ScopedLock lock(AllocatedLock_); Allocated_.erase(k); } bool SvmBuffer::Contains(uintptr_t ptr) { ScopedLock lock(AllocatedLock_); auto it = Allocated_.upper_bound(ptr); if (it == Allocated_.begin()) { return false; } --it; return ptr >= it->first && ptr < it->second; } // The allocation flags are ignored for now. void* SvmBuffer::malloc(Context& context, cl_svm_mem_flags flags, size_t size, size_t alignment) { bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; void* ret = context.svmAlloc(size, alignment, flags); if (ret == NULL) { LogError("Unable to allocate aligned memory"); return NULL; } uintptr_t ret_u = reinterpret_cast(ret); Add(ret_u, ret_u + size); return ret; } void SvmBuffer::free(const Context& context, void* ptr) { Remove(reinterpret_cast(ptr)); context.svmFree(ptr); } void SvmBuffer::memFill(void* dst, const void* src, size_t srcSize, size_t times) { address dstAddress = reinterpret_cast
(dst); const_address srcAddress = reinterpret_cast(src); for (size_t i = 0; i < times; i++) { ::memcpy(dstAddress + i * srcSize, srcAddress, srcSize); } } bool SvmBuffer::malloced(const void* ptr) { return Contains(reinterpret_cast(ptr)); } } // namespace amd