6d464be252
ECR #304775 - Make optimization for read map of USWC memory - If runtime detects USWC map with read operation, then it will switch to indirect map. This should improve map-read performance on APU(s) when USWC memory is used instead of frame buffer Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_memobj.cpp#72 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpudevice.cpp#269 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpudevice.hpp#89 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#172 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#234 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#486 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#134 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.cpp#112 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.hpp#43 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#340 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadevice.cpp#88 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadevice.hpp#45 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsamemory.cpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsamemory.hpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsavirtual.cpp#98 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.cpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsamemory.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsamemory.hpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.cpp#26 edit
939 lines
32 KiB
C++
939 lines
32 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#ifndef WITHOUT_FSA_BACKEND
|
|
|
|
#include "CL/cl_ext.h"
|
|
|
|
#include "device/device.hpp"
|
|
#include "device/hsa/hsamemory.hpp"
|
|
#include "device/hsa/hsadevice.hpp"
|
|
#include "device/hsa/hsablit.hpp"
|
|
#include "device/hsa/oclhsa_common.hpp"
|
|
#include "thread/monitor.hpp"
|
|
#include "platform/memory.hpp"
|
|
#include "platform/sampler.hpp"
|
|
|
|
namespace oclhsa {
|
|
|
|
/////////////////////////////////oclhsa::Memory//////////////////////////////
|
|
// Builds the HSA device-side object for `owner` on device `dev`.
// No backing storage is attached yet (deviceMemory_ starts out NULL) and the
// object is marked as not wrapping an interop resource.
Memory::Memory(const oclhsa::Device &dev, amd::Memory &owner)
    : device::Memory(owner), dev_(dev), deviceMemory_(NULL), interopType_(InteropNone) {}
|
|
|
|
// Nothing to release at this level: Buffer and Image call their own destroy()
// from their destructors.
Memory::~Memory() {
}
|
|
|
|
bool
|
|
Memory::allocateMapMemory(size_t allocationSize)
|
|
{
|
|
assert(mapMemory_ == NULL);
|
|
|
|
void *mapData = NULL;
|
|
|
|
// Use/reuse system memory from HSA system memory pool as backing
|
|
// storage of the map target.
|
|
if (kHsaStatusSuccess !=
|
|
servicesapi->HsaAllocateSystemMemory(
|
|
owner()->getSize(), 0, kHsaSystemMemoryTypeDefault, &mapData)) {
|
|
LogError("[OCL] Fail to allocate the backing storage for map target");
|
|
return false;
|
|
}
|
|
|
|
// Create buffer object to contain the map target.
|
|
amd::Memory *mapMemory =
|
|
new(owner()->getContext()) amd::Buffer(
|
|
owner()->getContext(), CL_MEM_USE_HOST_PTR, owner()->getSize());
|
|
|
|
if ((mapMemory == NULL) || (!mapMemory->create(mapData))) {
|
|
LogError("[OCL] Fail to allocate map target object");
|
|
servicesapi->HsaFreeSystemMemory(mapData);
|
|
if (mapMemory) {
|
|
mapMemory->release();
|
|
}
|
|
return false;
|
|
}
|
|
|
|
mapMemory_ = mapMemory;
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
Memory::freeMapMemory()
|
|
{
|
|
// Return the memory to HSA system memory pool.
|
|
assert(mapMemory_ != NULL);
|
|
servicesapi->HsaFreeSystemMemory(mapMemory_->getHostMem());
|
|
|
|
// Release the buffer object containing the map data.
|
|
mapMemory_->release();
|
|
mapMemory_ = NULL;
|
|
}
|
|
|
|
void *
|
|
Memory::allocMapTarget(const amd::Coord3D &origin,
|
|
const amd::Coord3D ®ion,
|
|
uint mapFlags,
|
|
size_t *rowPitch,
|
|
size_t *slicePitch)
|
|
{
|
|
// Map/Unmap must be serialized.
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
incIndMapCount();
|
|
|
|
// If the device backing storage is direct accessible, use it.
|
|
if (isHostMemDirectAccess()) {
|
|
return (static_cast<char *>(deviceMemory_) + origin[0]);
|
|
}
|
|
|
|
// Otherwise, check for host memory.
|
|
void *hostMem = owner()->getHostMem();
|
|
if (hostMem != NULL) {
|
|
return (static_cast<char *>(hostMem) + origin[0]);
|
|
}
|
|
|
|
// Allocate one if needed.
|
|
if (indirectMapCount_ == 1) {
|
|
if (!allocateMapMemory(owner()->getSize())) {
|
|
decIndMapCount();
|
|
return NULL;
|
|
}
|
|
}
|
|
else {
|
|
// Did the map resource allocation fail?
|
|
if (mapMemory_ == NULL) {
|
|
LogError("Could not map target resource");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
return (static_cast<char *>(mapMemory_->getHostMem()) + origin[0]);
|
|
}
|
|
|
|
void
|
|
Memory::decIndMapCount()
|
|
{
|
|
// Map/Unmap must be serialized.
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
if (indirectMapCount_ == 0) {
|
|
LogError("decIndMapCount() called when indirectMapCount_ already zero");
|
|
return;
|
|
}
|
|
|
|
// Decrement the counter and release indirect map if it's the last op
|
|
if (--indirectMapCount_ == 0 &&
|
|
mapMemory_ != NULL) {
|
|
freeMapMemory();
|
|
}
|
|
}
|
|
|
|
void *
|
|
Memory::cpuMap(
|
|
device::VirtualDevice& vDev,
|
|
uint flags,
|
|
uint startLayer,
|
|
uint numLayers,
|
|
size_t* rowPitch,
|
|
size_t* slicePitch
|
|
)
|
|
{
|
|
// Create the map target.
|
|
void * mapTarget =
|
|
allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), 0, rowPitch, slicePitch);
|
|
|
|
// Sync to map target if no direct access.
|
|
if (!isHostMemDirectAccess()) {
|
|
if (!vDev.blitMgr().readBuffer(
|
|
*this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) {
|
|
decIndMapCount();
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
return mapTarget;
|
|
}
|
|
|
|
void
|
|
Memory::cpuUnmap(device::VirtualDevice& vDev)
|
|
{
|
|
// Sync to device backing storage if no direct access.
|
|
if (!isHostMemDirectAccess()) {
|
|
if (!vDev.blitMgr().writeBuffer(
|
|
mapMemory_->getHostMem(), *this, amd::Coord3D(0),
|
|
amd::Coord3D(size()), true)) {
|
|
LogError("[OCL] Fail sync the device memory on cpuUnmap");
|
|
}
|
|
}
|
|
|
|
decIndMapCount();
|
|
}
|
|
|
|
void Memory::destroyInterop()
|
|
{
|
|
HsaStatus status;
|
|
#ifdef _WIN32
|
|
if (interopType_ == InteropD3D10) {
|
|
HsaStatus status = hsacoreapi->HsaUnmapD3D10Resource(
|
|
dev_.getBackendDevice(), d3d10Resource_);
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaUnmapD3D10Resource");
|
|
return;
|
|
}
|
|
}
|
|
|
|
else if (interopType_ == InteropD3D11) {
|
|
HsaStatus status = hsacoreapi->HsaUnmapD3D11Resource(
|
|
dev_.getBackendDevice(), d3d11Resource_);
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaUnmapD3D11Resource");
|
|
return;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (interopType_ == InteropGL) {
|
|
void * glContext =owner()->getContext().info().hCtx_;
|
|
status = hsacoreapi->HsaReleaseGLResources( dev_.getBackendDevice(),
|
|
glContext,
|
|
&glResource_,
|
|
1);
|
|
if (kHsaStatusSuccess != status) {
|
|
LogError("[OCL] Fail on HsaReleaseGLResources");
|
|
}
|
|
|
|
status = hsacoreapi->HsaUnmapGLResource(
|
|
dev_.getBackendDevice(), glContext, &glResource_);
|
|
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaUnmapGLResource");
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
Memory::isHsaLocalMemory() const {
|
|
if (owner()->isInterop()) {
|
|
return true;
|
|
}
|
|
else {
|
|
if (amd::Is64Bits()) {
|
|
uint64_t addr = reinterpret_cast<uint64_t>(deviceMemory_);
|
|
|
|
// Fast check: in 64 bits, CPU can only access the high area
|
|
// (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0).
|
|
// Reference: GFXIP7_ShaderIO_Delt.doc
|
|
addr >>= 47; // discard least significant 47 bits
|
|
return (addr != 0x1FFFF && addr != 0);
|
|
}
|
|
else {
|
|
const HsaMemoryDescriptor &memDesc =
|
|
dev_.getBackendDevice()->memory_descriptors[0];
|
|
|
|
if (memDesc.heap_type == kHsaHeapTypeFrameBufferPrivate) {
|
|
const uintptr_t addr =
|
|
reinterpret_cast<uintptr_t>(deviceMemory_);
|
|
const uintptr_t gpuvmBase = memDesc.virtual_base_address;
|
|
const size_t size = memDesc.size_in_bytes;
|
|
return (addr >= gpuvmBase && addr < (gpuvmBase + size));
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/////////////////////////////////oclhsa::Buffer//////////////////////////////
|
|
|
|
// Buffer flavour of the HSA device memory object; all state lives in the
// oclhsa::Memory base, which is initialised with the same device and owner.
Buffer::Buffer(const oclhsa::Device &dev, amd::Memory &owner)
    : oclhsa::Memory(dev, owner) {
}
|
|
|
|
// Releases the backing storage through destroy().
Buffer::~Buffer() { destroy(); }
|
|
|
|
void
|
|
Buffer::destroy()
|
|
{
|
|
if (owner()->parent() != NULL) {
|
|
return;
|
|
}
|
|
|
|
if (owner()->isInterop()) {
|
|
destroyInterop();
|
|
return;
|
|
}
|
|
|
|
if (isHostMemoryRegistered()) {
|
|
hsacoreapi->HsaDeregisterSystemMemory(deviceMemory_);
|
|
}
|
|
else {
|
|
if (!isHostMemDirectAccess()) {
|
|
hsacoreapi->HsaFreeDeviceMemory(deviceMemory_);
|
|
}
|
|
else if (deviceMemory_ != owner()->getHostMem()) {
|
|
// if they are identical, the host pointer will be
|
|
// deallocated later on => avoid double deallocation
|
|
hsacoreapi->HsaAmdFreeSystemMemory(deviceMemory_);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Buffer::createInterop()
|
|
{
|
|
amd::InteropObject *interopObject = owner()->getInteropObj();
|
|
|
|
#ifdef _WIN32
|
|
if (interopObject->asD3D10Object() != NULL) {
|
|
amd::D3D10Object *d3d10Object = interopObject->asD3D10Object();
|
|
// 1. Get the D3D11 resource
|
|
ID3D10Resource *resource = d3d10Object->getD3D10Resource();
|
|
ID3D10Buffer *d3d10Buffer = static_cast<ID3D10Buffer *>(resource);
|
|
|
|
HsaStatus status = hsacoreapi->HsaMapD3D10Buffer(
|
|
dev_.getBackendDevice(), d3d10Buffer, &deviceMemory_);
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaMapD3D10Buffer");
|
|
return false;
|
|
}
|
|
interopType_ = InteropD3D10;
|
|
d3d10Resource_ = d3d10Buffer;
|
|
}
|
|
|
|
if (interopObject->asD3D11Object() != NULL) {
|
|
amd::D3D11Object *d3d11Object = interopObject->asD3D11Object();
|
|
// 1. Get the D3D11 resource
|
|
ID3D11Resource *resource = d3d11Object->getD3D11Resource();
|
|
ID3D11Buffer *d3d11Buffer = static_cast<ID3D11Buffer *>(resource);
|
|
|
|
HsaStatus status = hsacoreapi->HsaMapD3D11Buffer(
|
|
dev_.getBackendDevice(), d3d11Buffer, &deviceMemory_);
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaMapD3D10Buffer");
|
|
return false;
|
|
}
|
|
interopType_ = InteropD3D11;
|
|
d3d11Resource_ = d3d11Buffer;
|
|
}
|
|
#endif
|
|
|
|
if (interopObject->asBufferGL()) {
|
|
amd::BufferGL *buffer_gl = interopObject->asBufferGL();
|
|
HsaGLResource gl_resource = {0};
|
|
gl_resource.name = buffer_gl->getGLName();
|
|
gl_resource.type = buffer_gl->getGLInternalFormat();
|
|
|
|
void * glContext =owner()->getContext().info().hCtx_;
|
|
HsaStatus status = hsacoreapi->HsaMapGLBuffer(
|
|
dev_.getBackendDevice(), glContext, &gl_resource, &deviceMemory_);
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaMapGLBuffer");
|
|
return false;
|
|
}
|
|
|
|
status = hsacoreapi->HsaAcquireGLResources( dev_.getBackendDevice(),
|
|
glContext,
|
|
&gl_resource,
|
|
1);
|
|
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaAcquireGLResources");
|
|
return false;
|
|
}
|
|
interopType_ = InteropGL;
|
|
glResource_ = gl_resource;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Buffer::create()
|
|
{
|
|
if (owner()->parent()) {
|
|
// Sub-Buffer creation.
|
|
oclhsa::Memory *parentBuffer =
|
|
static_cast<oclhsa::Memory *>(owner()->parent()->getDeviceMemory(dev_));
|
|
|
|
if (parentBuffer == NULL) {
|
|
LogError("[OCL] Fail to allocate parent buffer");
|
|
return false;
|
|
}
|
|
|
|
const size_t offset = owner()->getOrigin();
|
|
deviceMemory_ =
|
|
static_cast<char *>(parentBuffer->getDeviceMemory()) + offset;
|
|
|
|
void* parentHostPtr = parentBuffer->owner()->getHostMem();
|
|
if (parentHostPtr) {
|
|
owner()->setHostMem(static_cast<char *>(parentHostPtr) + offset);
|
|
}
|
|
|
|
flags_ |= owner()->parent()->getMemFlags();
|
|
return true;
|
|
}
|
|
|
|
// Allocate backing storage in device local memory unless UHP or AHP are set
|
|
const cl_mem_flags memFlags = owner()->getMemFlags();
|
|
if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) {
|
|
bool useDeviceMemory = dev_.settings().enableLocalMemory_;
|
|
size_t alignment = static_cast<size_t>(dev_.info().memBaseAddrAlign_);
|
|
if (useDeviceMemory) {
|
|
hsacoreapi->HsaAllocateDeviceMemory(
|
|
size(), alignment, dev_.getBackendDevice(), &deviceMemory_);
|
|
if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) {
|
|
bool ret = dev_.xferMgr().writeBuffer(owner()->getHostMem(), *this,
|
|
amd::Coord3D(0), amd::Coord3D(size()), true);
|
|
if (!ret) {
|
|
hsacoreapi->HsaFreeDeviceMemory(deviceMemory_);
|
|
deviceMemory_ = NULL;
|
|
}
|
|
return ret;
|
|
}
|
|
// if device memory is depleted, do not fall back to system memory
|
|
return deviceMemory_ != NULL;
|
|
}
|
|
else if (!(owner()->getHostMem())) {
|
|
flags_ |= HostMemoryDirectAccess;
|
|
deviceMemory_ = dev_.hostAlloc(size(), alignment);
|
|
// no need to copy - otherwise, the host pointer will not be NULL
|
|
return deviceMemory_ != NULL;
|
|
}
|
|
}
|
|
|
|
flags_ |= HostMemoryDirectAccess;
|
|
void* hostMem = owner()->getHostMem();
|
|
assert(hostMem);
|
|
// If there is a host ptr, then register it only if it was not allocated,
|
|
// (=> allocated by us)
|
|
if (!(owner()->getHostMemRef()->alloced())) {
|
|
// Reuse existing host memory for the backing storage and register it.
|
|
//
|
|
// SVM precludes a possible 64-bits optimization in which host buffers
|
|
// allocated by the user (UHP) in the default, coherent space could be
|
|
// mapped into the non-coherent space by means of CreateFileMapping/mmap
|
|
// without copying any data (the "device memory" would be the
|
|
// non-coherent buffer).
|
|
// The optimization cannot be applied because regular buffers allocated
|
|
// using UHP are expected to have same characteristics as the original
|
|
// buffer, i.e., if the original buffer supports atomics then the
|
|
// corresponding OpenCL buffer will support atomics too.
|
|
flags_ |= HostMemoryRegistered;
|
|
if (hsacoreapi->HsaRegisterSystemMemory(hostMem, size()) != kHsaStatusSuccess) {
|
|
LogError("[OCL] Failed to register system memory");
|
|
return false;
|
|
}
|
|
}
|
|
deviceMemory_ = hostMem;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Buffer::recreate(size_t newSize, size_t newAlignment, bool forceSystem) {
|
|
const size_t memFlag = static_cast<size_t>(owner()->getMemFlags());
|
|
if ((memFlag & CL_MEM_ALLOC_HOST_PTR) ||
|
|
(memFlag & CL_MEM_USE_HOST_PTR) ||
|
|
!dev_.settings().enableLocalMemory_) {
|
|
forceSystem = true;
|
|
}
|
|
|
|
void *newDeviceMemory = NULL;
|
|
uint hostDirectAccess = 0;
|
|
|
|
if (forceSystem) {
|
|
newDeviceMemory = dev_.hostAlloc(newSize, newAlignment);
|
|
if (newDeviceMemory == NULL) {
|
|
LogError("[OCL] Fail to reallocate system memory");
|
|
return false;
|
|
}
|
|
|
|
// Copy the old data to the new memory location.
|
|
if (!dev_.xferMgr().readBuffer(*this, newDeviceMemory,
|
|
amd::Coord3D(0),
|
|
amd::Coord3D(size()),
|
|
true)) {
|
|
LogError("[OCL] Fail to copy the current value");
|
|
dev_.hostFree(newDeviceMemory);
|
|
newDeviceMemory = NULL;
|
|
return false;
|
|
}
|
|
|
|
hostDirectAccess = HostMemoryDirectAccess;
|
|
}
|
|
else {
|
|
hsacoreapi->HsaAllocateDeviceMemory(
|
|
newSize, newAlignment, dev_.getBackendDevice(), &newDeviceMemory);
|
|
|
|
if (newDeviceMemory == NULL) {
|
|
LogError("[OCL] Fail to reallocate device local memory");
|
|
return false;
|
|
}
|
|
|
|
assert(
|
|
amd::isMultipleOf(static_cast<char *>(newDeviceMemory),
|
|
newAlignment));
|
|
|
|
// Copy the old data to the new memory location.
|
|
if (!dev_.xferMgr().readBuffer(
|
|
*this, newDeviceMemory, amd::Coord3D(0), amd::Coord3D(size()),
|
|
true)) {
|
|
LogError("[OCL] Fail to copy the current value");
|
|
hsacoreapi->HsaFreeDeviceMemory(newDeviceMemory);
|
|
newDeviceMemory = NULL;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
destroy();
|
|
|
|
deviceMemory_ = newDeviceMemory;
|
|
|
|
if ((memFlag & CL_MEM_ALLOC_HOST_PTR) &&
|
|
(owner()->getContext().devices().size() == 1)) {
|
|
owner()->setHostMem(deviceMemory_);
|
|
}
|
|
|
|
flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered);
|
|
flags_ |= hostDirectAccess;
|
|
|
|
return true;
|
|
}
|
|
|
|
/////////////////////////////////oclhsa::Image//////////////////////////////
|
|
|
|
// Image flavour of the HSA device memory object. Images start out with
// neither the direct-access nor the registered flag set, and the HSA image
// descriptor is filled in from the owner right away.
Image::Image(const oclhsa::Device& dev, amd::Memory& owner)
    : oclhsa::Memory(dev, owner)
{
    flags_ &= ~(HostMemoryDirectAccess | HostMemoryRegistered);
    populateImageDescriptor();
}
|
|
|
|
struct ImageFormatLayout {
|
|
cl_image_format clFormat;
|
|
HsaImageFormat hsaFormat;
|
|
};
|
|
|
|
// Translation table from (channel order, channel data type) to the HSA image
// format, searched linearly by Image::populateImageDescriptor().
// Rows are grouped by channel order.
static const ImageFormatLayout ImageFormatLayoutMap[] = {
    // CL_R
    { { CL_R, CL_UNORM_INT8 },              HSA_IMAGE_FMT_R8_UNORM },
    { { CL_R, CL_UNORM_INT16 },             HSA_IMAGE_FMT_R16_UNORM },
    { { CL_R, CL_SNORM_INT8 },              HSA_IMAGE_FMT_R8_SNORM },
    { { CL_R, CL_SNORM_INT16 },             HSA_IMAGE_FMT_R16_SNORM },
    { { CL_R, CL_SIGNED_INT8 },             HSA_IMAGE_FMT_R8_SINT },
    { { CL_R, CL_SIGNED_INT16 },            HSA_IMAGE_FMT_R16_SINT },
    { { CL_R, CL_SIGNED_INT32 },            HSA_IMAGE_FMT_R32_SINT },
    { { CL_R, CL_UNSIGNED_INT8 },           HSA_IMAGE_FMT_R8_UINT },
    { { CL_R, CL_UNSIGNED_INT16 },          HSA_IMAGE_FMT_R16_UINT },
    { { CL_R, CL_UNSIGNED_INT32 },          HSA_IMAGE_FMT_R32_UINT },
    { { CL_R, CL_HALF_FLOAT },              HSA_IMAGE_FMT_R_HALFFLOAT },
    { { CL_R, CL_FLOAT },                   HSA_IMAGE_FMT_R_FLOAT },

    // CL_A
    { { CL_A, CL_UNORM_INT8 },              HSA_IMAGE_FMT_A8_UNORM },
    { { CL_A, CL_UNORM_INT16 },             HSA_IMAGE_FMT_A16_UNORM },
    { { CL_A, CL_SNORM_INT8 },              HSA_IMAGE_FMT_A8_SNORM },
    { { CL_A, CL_SNORM_INT16 },             HSA_IMAGE_FMT_A16_SNORM },
    { { CL_A, CL_SIGNED_INT8 },             HSA_IMAGE_FMT_A8_SINT },
    { { CL_A, CL_SIGNED_INT16 },            HSA_IMAGE_FMT_A16_SINT },
    { { CL_A, CL_SIGNED_INT32 },            HSA_IMAGE_FMT_A32_SINT },
    { { CL_A, CL_UNSIGNED_INT8 },           HSA_IMAGE_FMT_A8_UINT },
    { { CL_A, CL_UNSIGNED_INT16 },          HSA_IMAGE_FMT_A16_UINT },
    { { CL_A, CL_UNSIGNED_INT32 },          HSA_IMAGE_FMT_A32_UINT },
    { { CL_A, CL_HALF_FLOAT },              HSA_IMAGE_FMT_A_HALFFLOAT },
    { { CL_A, CL_FLOAT },                   HSA_IMAGE_FMT_A_FLOAT },

    // CL_RG
    { { CL_RG, CL_UNORM_INT8 },             HSA_IMAGE_FMT_R8G8_UNORM },
    { { CL_RG, CL_UNORM_INT16 },            HSA_IMAGE_FMT_R16G16_UNORM },
    { { CL_RG, CL_SNORM_INT8 },             HSA_IMAGE_FMT_R8G8_SNORM },
    { { CL_RG, CL_SNORM_INT16 },            HSA_IMAGE_FMT_R16G16_SNORM },
    { { CL_RG, CL_SIGNED_INT8 },            HSA_IMAGE_FMT_R8G8_SINT },
    { { CL_RG, CL_SIGNED_INT16 },           HSA_IMAGE_FMT_R16G16_SINT },
    { { CL_RG, CL_SIGNED_INT32 },           HSA_IMAGE_FMT_R32G32_SINT },
    { { CL_RG, CL_UNSIGNED_INT8 },          HSA_IMAGE_FMT_R8G8_UINT },
    { { CL_RG, CL_UNSIGNED_INT16 },         HSA_IMAGE_FMT_R16G16_UINT },
    { { CL_RG, CL_UNSIGNED_INT32 },         HSA_IMAGE_FMT_R32G32_UINT },
    { { CL_RG, CL_HALF_FLOAT },             HSA_IMAGE_FMT_RG_HALFFLOAT },
    { { CL_RG, CL_FLOAT },                  HSA_IMAGE_FMT_RG_FLOAT },

    // CL_RA
    { { CL_RA, CL_UNORM_INT8 },             HSA_IMAGE_FMT_R8A8_UNORM },
    { { CL_RA, CL_UNORM_INT16 },            HSA_IMAGE_FMT_R16A16_UNORM },
    { { CL_RA, CL_SNORM_INT8 },             HSA_IMAGE_FMT_R8A8_SNORM },
    { { CL_RA, CL_SNORM_INT16 },            HSA_IMAGE_FMT_R16A16_SNORM },
    { { CL_RA, CL_SIGNED_INT8 },            HSA_IMAGE_FMT_R8A8_SINT },
    { { CL_RA, CL_SIGNED_INT16 },           HSA_IMAGE_FMT_R16A16_SINT },
    { { CL_RA, CL_SIGNED_INT32 },           HSA_IMAGE_FMT_R32A32_SINT },
    { { CL_RA, CL_UNSIGNED_INT8 },          HSA_IMAGE_FMT_R8A8_UINT },
    { { CL_RA, CL_UNSIGNED_INT16 },         HSA_IMAGE_FMT_R16A16_UINT },
    { { CL_RA, CL_UNSIGNED_INT32 },         HSA_IMAGE_FMT_R32A32_UINT },
    { { CL_RA, CL_HALF_FLOAT },             HSA_IMAGE_FMT_RA_HALFFLOAT },
    { { CL_RA, CL_FLOAT },                  HSA_IMAGE_FMT_RA_FLOAT },

    // CL_RGBA
    { { CL_RGBA, CL_UNORM_INT8 },           HSA_IMAGE_FMT_R8G8B8A8_UNORM },
    { { CL_RGBA, CL_UNORM_INT16 },          HSA_IMAGE_FMT_R16G16B16A16_UNORM },
    { { CL_RGBA, CL_SNORM_INT8 },           HSA_IMAGE_FMT_R8G8B8A8_SNORM },
    { { CL_RGBA, CL_SNORM_INT16 },          HSA_IMAGE_FMT_R16G16B16A16_SNORM },
    { { CL_RGBA, CL_SIGNED_INT8 },          HSA_IMAGE_FMT_R8G8B8A8_SINT },
    { { CL_RGBA, CL_SIGNED_INT16 },         HSA_IMAGE_FMT_R16G16B16A16_SINT },
    { { CL_RGBA, CL_SIGNED_INT32 },         HSA_IMAGE_FMT_R32G32B32A32_SINT },
    { { CL_RGBA, CL_UNSIGNED_INT8 },        HSA_IMAGE_FMT_R8G8B8A8_UINT },
    { { CL_RGBA, CL_UNSIGNED_INT16 },       HSA_IMAGE_FMT_R16G16B16A16_UINT },
    { { CL_RGBA, CL_UNSIGNED_INT32 },       HSA_IMAGE_FMT_R32G32B32A32_UINT },
    { { CL_RGBA, CL_HALF_FLOAT },           HSA_IMAGE_FMT_RGBA_HALFFLOAT },
    { { CL_RGBA, CL_FLOAT },                HSA_IMAGE_FMT_RGBA_FLOAT },

    // CL_ARGB
    { { CL_ARGB, CL_UNORM_INT8 },           HSA_IMAGE_FMT_A8R8G8B8_UNORM },
    { { CL_ARGB, CL_SNORM_INT8 },           HSA_IMAGE_FMT_A8R8G8B8_SNORM },
    { { CL_ARGB, CL_SIGNED_INT8 },          HSA_IMAGE_FMT_A8R8G8B8_SINT },
    { { CL_ARGB, CL_UNSIGNED_INT8 },        HSA_IMAGE_FMT_A8R8G8B8_UINT },

    // CL_BGRA
    { { CL_BGRA, CL_UNORM_INT8 },           HSA_IMAGE_FMT_B8G8R8A8_UNORM },
    { { CL_BGRA, CL_SNORM_INT8 },           HSA_IMAGE_FMT_B8G8R8A8_SNORM },
    { { CL_BGRA, CL_SIGNED_INT8 },          HSA_IMAGE_FMT_B8G8R8A8_SINT },
    { { CL_BGRA, CL_UNSIGNED_INT8 },        HSA_IMAGE_FMT_B8G8R8A8_UINT },

    // CL_LUMINANCE
    { { CL_LUMINANCE, CL_SNORM_INT8 },      HSA_IMAGE_FMT_L8_SNORM },
    { { CL_LUMINANCE, CL_SNORM_INT16 },     HSA_IMAGE_FMT_L16_SNORM },
    { { CL_LUMINANCE, CL_UNORM_INT8 },      HSA_IMAGE_FMT_L8_UNORM },
    { { CL_LUMINANCE, CL_UNORM_INT16 },     HSA_IMAGE_FMT_L16_UNORM },
    { { CL_LUMINANCE, CL_HALF_FLOAT },      HSA_IMAGE_FMT_L_HALFFLOAT },
    { { CL_LUMINANCE, CL_FLOAT },           HSA_IMAGE_FMT_L_FLOAT },

    // CL_INTENSITY
    { { CL_INTENSITY, CL_SNORM_INT8 },      HSA_IMAGE_FMT_I8_SNORM },
    { { CL_INTENSITY, CL_SNORM_INT16 },     HSA_IMAGE_FMT_I16_SNORM },
    { { CL_INTENSITY, CL_UNORM_INT8 },      HSA_IMAGE_FMT_I8_UNORM },
    { { CL_INTENSITY, CL_UNORM_INT16 },     HSA_IMAGE_FMT_I16_UNORM },
    { { CL_INTENSITY, CL_HALF_FLOAT },      HSA_IMAGE_FMT_I_HALFFLOAT },
    { { CL_INTENSITY, CL_FLOAT },           HSA_IMAGE_FMT_I_FLOAT },

    // CL_RGB (packed formats)
    { { CL_RGB, CL_UNORM_SHORT_565 },       HSA_IMAGE_FMT_R5G6B5_UNORM },
    { { CL_RGB, CL_UNORM_SHORT_555 },       HSA_IMAGE_FMT_R5G5B5_UNORM },
    { { CL_RGB, CL_UNORM_INT_101010 },      HSA_IMAGE_FMT_R10G10B10_UNORM }
};
|
|
|
|
void
|
|
Image::populateImageDescriptor()
|
|
{
|
|
amd::Image* image = owner()->asImage();
|
|
|
|
// build HSA runtime image descriptor
|
|
imageDescriptor_.width = image->getWidth();
|
|
imageDescriptor_.height = image->getHeight();
|
|
imageDescriptor_.depth = image->getDepth();
|
|
imageDescriptor_.arraySize = 0;
|
|
|
|
// Device specific image does not require rowpitch/slicepitch information.
|
|
// Only image buffer is required to specify rowpitch size.
|
|
imageDescriptor_.rowPitchInBytes = 0;
|
|
imageDescriptor_.slicePitchInBytes = 0;
|
|
|
|
switch (image->getType())
|
|
{
|
|
case CL_MEM_OBJECT_IMAGE1D:
|
|
imageDescriptor_.geometry = HSA_GEOMETRY_1D;
|
|
imageDescriptor_.height = 1;
|
|
imageDescriptor_.depth = 1;
|
|
break;
|
|
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
|
|
imageDescriptor_.geometry = HSA_GEOMETRY_1DBuffer;
|
|
imageDescriptor_.height = 1;
|
|
imageDescriptor_.depth = 1;
|
|
break;
|
|
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
|
|
//@todo - arraySize = height ?!
|
|
imageDescriptor_.geometry = HSA_GEOMETRY_1DArray;
|
|
imageDescriptor_. height = 1;
|
|
imageDescriptor_.arraySize = image->getHeight();
|
|
break;
|
|
case CL_MEM_OBJECT_IMAGE2D:
|
|
imageDescriptor_.geometry = HSA_GEOMETRY_2D;
|
|
imageDescriptor_.depth = 1;
|
|
break;
|
|
case CL_MEM_OBJECT_IMAGE2D_ARRAY:
|
|
//@todo - arraySize = depth ?!
|
|
imageDescriptor_.geometry = HSA_GEOMETRY_2DArray;
|
|
imageDescriptor_.depth = 1;
|
|
imageDescriptor_.arraySize = image->getDepth();
|
|
break;
|
|
case CL_MEM_OBJECT_IMAGE3D:
|
|
imageDescriptor_.geometry = HSA_GEOMETRY_3D;
|
|
break;
|
|
}
|
|
|
|
for (uint i = 0; i < sizeof(ImageFormatLayoutMap) / sizeof(ImageFormatLayout); ++i) {
|
|
if ((image->getImageFormat().image_channel_data_type ==
|
|
ImageFormatLayoutMap[i].clFormat.image_channel_data_type) &&
|
|
(image->getImageFormat().image_channel_order ==
|
|
ImageFormatLayoutMap[i].clFormat.image_channel_order)) {
|
|
imageDescriptor_.format = ImageFormatLayoutMap[i].hsaFormat;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Image::createInterop() {
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
amd::InteropObject *interopObject = owner()->getInteropObj();
|
|
void *hsaImageObjectInterop = NULL;
|
|
size_t hsaImageObjectInteropSize = 0;
|
|
#ifdef _WIN32
|
|
if (interopObject->asD3D10Object()) {
|
|
amd::D3D10Object *d3d10Object = interopObject->asD3D10Object();
|
|
// 1. Get the D3D11 resource
|
|
ID3D10Resource *resource = d3d10Object->getD3D10Resource();
|
|
HsaStatus status = hsacoreapi->HsaMapD3D10Texture(
|
|
dev_.getBackendDevice(), resource, &hsaImageObjectInterop,
|
|
&hsaImageObjectInteropSize, kHsaMapFlagsReadWrite);
|
|
if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0 ) {
|
|
LogError("[OCL] Fail on HsaMapD3D10Texture");
|
|
return false;
|
|
}
|
|
interopType_ = InteropD3D10;
|
|
d3d10Resource_ = resource;
|
|
}
|
|
|
|
if (interopObject->asD3D11Object()) {
|
|
amd::D3D11Object *d3d11Object = interopObject->asD3D11Object();
|
|
|
|
// 1. Get the D3D11 resource
|
|
ID3D11Resource *resource = d3d11Object->getD3D11Resource();
|
|
HsaStatus status = hsacoreapi->HsaMapD3D11Texture(
|
|
dev_.getBackendDevice(), resource, &hsaImageObjectInterop,
|
|
&hsaImageObjectInteropSize, kHsaMapFlagsReadWrite,
|
|
d3d11Object->getPlane());
|
|
if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0 ) {
|
|
LogError("[OCL] Fail on HsaMapD3D11Texture");
|
|
return false;
|
|
}
|
|
interopType_ = InteropD3D11;
|
|
d3d11Resource_ = resource;
|
|
}
|
|
#endif
|
|
|
|
if (interopObject->asGLObject()) {
|
|
amd::GLObject* gl_object = interopObject->asGLObject();
|
|
HsaGLResource gl_resource = {0};
|
|
gl_resource.name = gl_object->getGLName();
|
|
if (gl_object->getGLTarget() != GL_TEXTURE_CUBE_MAP) {
|
|
gl_resource.type = gl_object->getGLTarget();
|
|
}
|
|
else {
|
|
gl_resource.type = gl_object->getCubemapFace();
|
|
}
|
|
gl_resource.mipmap_level = gl_object->getGLMipLevel();
|
|
|
|
void * glContext =owner()->getContext().info().hCtx_;
|
|
|
|
// Get the texture SRD.
|
|
HsaStatus status = hsacoreapi->HsaMapGLTexture(
|
|
dev_.getBackendDevice(), glContext, &gl_resource,
|
|
&hsaImageObjectInterop, &hsaImageObjectInteropSize);
|
|
if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0) {
|
|
LogError("[OCL] Fail on HsaMapGLTexture");
|
|
return false;
|
|
}
|
|
|
|
status = hsacoreapi->HsaAcquireGLResources( dev_.getBackendDevice(),
|
|
glContext,
|
|
&gl_resource,
|
|
1);
|
|
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaAcquireGLResources");
|
|
return false;
|
|
}
|
|
|
|
// Get the flat address for texture buffer.
|
|
if (owner()->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
|
|
// Map the texture buffer resource as buffer.
|
|
HsaStatus status = hsacoreapi->HsaMapGLBuffer(
|
|
dev_.getBackendDevice(), glContext, &gl_resource,
|
|
&deviceMemory_);
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail on HsaMapGLBuffer");
|
|
return false;
|
|
}
|
|
// Sanity check.
|
|
assert((deviceMemory_ != NULL) &&
|
|
"deviceMemory_ should not be \
|
|
NULL upon successful return from HsaMapGLBuffer");
|
|
}
|
|
|
|
interopType_ = InteropGL;
|
|
glResource_ = gl_resource;
|
|
}
|
|
|
|
// Populate HSA specific information to the interop image object.
|
|
HsaStatus status = hsacoreapi->HsaAmdCreateDeviceImageView(
|
|
&imageDescriptor_, hsaImageObjectInterop, hsaImageObject_);
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail to tranform interop image SRD");
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Image::create()
|
|
{
|
|
if (owner()->parent()) {
|
|
// Image view creation
|
|
oclhsa::Image *parentImage =
|
|
static_cast<oclhsa::Image *>(owner()->parent()->getDeviceMemory(dev_));
|
|
|
|
if (parentImage == NULL) {
|
|
LogError("[OCL] Fail to allocate parent image");
|
|
return false;
|
|
}
|
|
|
|
return createView(*parentImage);
|
|
}
|
|
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
// Get memory size requirement for device specific image.
|
|
HsaStatus status = hsacoreapi->HsaGetDeviceImageInfo(
|
|
dev_.getBackendDevice(), &imageDescriptor_,
|
|
&deviceImageInfo_);
|
|
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail to allocate image memory");
|
|
return false;
|
|
}
|
|
|
|
if (dev_.settings().enableLocalMemory_) {
|
|
status = hsacoreapi->HsaAllocateDeviceMemory(
|
|
deviceImageInfo_.imageSizeInBytes,
|
|
deviceImageInfo_.imageAlignmentInBytes,
|
|
dev_.getBackendDevice(),
|
|
&deviceMemory_);
|
|
} else {
|
|
status = servicesapi->HsaAllocateSystemMemory(
|
|
deviceImageInfo_.imageSizeInBytes,
|
|
deviceImageInfo_.imageAlignmentInBytes,
|
|
kHsaSystemMemoryTypeDefault,
|
|
&deviceMemory_);
|
|
}
|
|
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail to allocate image memory");
|
|
return false;
|
|
}
|
|
|
|
assert(amd::isMultipleOf(
|
|
deviceMemory_, deviceImageInfo_.imageAlignmentInBytes));
|
|
|
|
status = hsacoreapi->HsaCreateDeviceImage(
|
|
dev_.getBackendDevice(), &imageDescriptor_,
|
|
deviceMemory_, &hsaImageObject_[0]);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Image::createView(Image &parent)
|
|
{
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
if (parent.owner()->asBuffer()) {
|
|
// Get new texture SRD since parent is a buffer.
|
|
deviceMemory_ = parent.getDeviceMemory();
|
|
|
|
// Force device specific image implementation to use rowpitch size.
|
|
amd::Image* image = owner()->asImage();
|
|
imageDescriptor_.rowPitchInBytes = image->getRowPitch();
|
|
|
|
HsaStatus status = hsacoreapi->HsaCreateDeviceImage(
|
|
dev_.getBackendDevice(), &imageDescriptor_,
|
|
deviceMemory_, &hsaImageObject_[0]);
|
|
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail to create HSA image object");
|
|
return false;
|
|
}
|
|
} else {
|
|
// Get the view of the existing parent's SRD based on the child's image
|
|
// descriptor.
|
|
HsaStatus status = hsacoreapi->HsaAmdCreateDeviceImageView(
|
|
&imageDescriptor_, parent.getHsaImageObjectAddress(),
|
|
&hsaImageObject_[0]);
|
|
if (status != kHsaStatusSuccess) {
|
|
LogError("[OCL] Fail to get view of parent image");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Returns the host address at which the texel at `origin` can be accessed for
// a map operation, and reports the pitches of the mapped layout.
//
// The map count is incremented first. The owner's host memory is used when it
// has one; otherwise a staging buffer (mapMemory_) is used, allocated by the
// first outstanding map and shared by the following ones.
//
// origin     - texel coordinates; converted to a byte offset with the image's
//              element size, row pitch and slice pitch.
// region     - not used by this implementation.
// mapFlags   - not used by this implementation.
// rowPitch   - receives the image's row pitch (skipped when NULL).
// slicePitch - receives the image's slice pitch (skipped when NULL).
//
// Returns NULL when no staging buffer is available. In that case the map count
// taken at the top is given back on every failure path, so a failed map does
// not leave the counter permanently elevated.
void* Image::allocMapTarget(const amd::Coord3D& origin,
                            const amd::Coord3D& region,
                            uint mapFlags,
                            size_t* rowPitch,
                            size_t* slicePitch)
{
    amd::ScopedLock lock(owner()->lockMemoryOps());

    incIndMapCount();

    void* pHostMem = owner()->getHostMem();

    if (pHostMem == NULL) {
        if (indirectMapCount_ == 1) {
            // First outstanding map: create the staging buffer.
            if (!allocateMapMemory(owner()->getSize())) {
                decIndMapCount();
                return NULL;
            }
        }
        else if (mapMemory_ == NULL) {
            // Other maps are outstanding but there is no staging buffer.
            LogError("Could not map target resource");
            // This map did not succeed, so its reference must not be kept.
            decIndMapCount();
            return NULL;
        }

        pHostMem = mapMemory_->getHostMem();
    }

    amd::Image* image = owner()->asImage();

    const size_t elementSize = image->getImageFormat().getElementSize();

    // X contributes whole elements, Y whole rows, Z whole slices.
    size_t offset = origin[0] * elementSize;
    offset += image->getRowPitch() * origin[1];
    offset += image->getSlicePitch() * origin[2];

    // Both out-parameters are optional from this function's point of view.
    if (rowPitch != NULL) {
        *rowPitch = image->getRowPitch();
    }
    if (slicePitch != NULL) {
        *slicePitch = image->getSlicePitch();
    }

    return (static_cast<uint8_t*>(pHostMem) + offset);
}
|
|
|
|
// Releases the backing storage through destroy().
Image::~Image() { destroy(); }
|
|
|
|
void
|
|
Image::destroy()
|
|
{
|
|
if (owner()->parent() != NULL) {
|
|
return;
|
|
}
|
|
|
|
if (owner()->isInterop()) {
|
|
destroyInterop();
|
|
return;
|
|
}
|
|
|
|
if (dev_.settings().enableLocalMemory_) {
|
|
hsacoreapi->HsaFreeDeviceMemory(deviceMemory_);
|
|
}
|
|
else {
|
|
servicesapi->HsaFreeSystemMemory(deviceMemory_);
|
|
}
|
|
}
|
|
}
|
|
#endif // WITHOUT_FSA_BACKEND
|