Files
rocm-systems/rocclr/runtime/device/hsa/hsamemory.cpp
T
foreman 6d464be252 P4 to Git Change 1101352 by gandryey@gera-dev-w7 on 2014/11/28 18:03:18
ECR #304775 - Make optimization for read map of USWC memory
	- If the runtime detects a USWC map with a read operation, it switches to an indirect map. This should improve map-read performance on APU(s) when USWC memory is used instead of the frame buffer.

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_memobj.cpp#72 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpudevice.cpp#269 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpudevice.hpp#89 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#172 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#234 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#486 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#134 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.cpp#112 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.hpp#43 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#340 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadevice.cpp#88 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadevice.hpp#45 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsamemory.cpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsamemory.hpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsavirtual.cpp#98 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.cpp#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsamemory.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsamemory.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.cpp#26 edit
2014-11-28 18:11:36 -05:00

939 lines
32 KiB
C++

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef WITHOUT_FSA_BACKEND
#include "CL/cl_ext.h"
#include "device/device.hpp"
#include "device/hsa/hsamemory.hpp"
#include "device/hsa/hsadevice.hpp"
#include "device/hsa/hsablit.hpp"
#include "device/hsa/oclhsa_common.hpp"
#include "thread/monitor.hpp"
#include "platform/memory.hpp"
#include "platform/sampler.hpp"
namespace oclhsa {
/////////////////////////////////oclhsa::Memory//////////////////////////////
// Binds this device-memory object to its HSA device and to the amd::Memory
// it backs. No backing storage is attached yet (deviceMemory_ is NULL) and
// the object starts out as a non-interop resource.
Memory::Memory(const oclhsa::Device &dev, amd::Memory &owner)
    : device::Memory(owner)
    , dev_(dev)
    , deviceMemory_(NULL)
    , interopType_(InteropNone)
{
}
// Intentionally empty: the Buffer and Image destructors in this file call
// their own destroy() to release backing storage.
Memory::~Memory()
{
}
bool
Memory::allocateMapMemory(size_t allocationSize)
{
assert(mapMemory_ == NULL);
void *mapData = NULL;
// Use/reuse system memory from HSA system memory pool as backing
// storage of the map target.
if (kHsaStatusSuccess !=
servicesapi->HsaAllocateSystemMemory(
owner()->getSize(), 0, kHsaSystemMemoryTypeDefault, &mapData)) {
LogError("[OCL] Fail to allocate the backing storage for map target");
return false;
}
// Create buffer object to contain the map target.
amd::Memory *mapMemory =
new(owner()->getContext()) amd::Buffer(
owner()->getContext(), CL_MEM_USE_HOST_PTR, owner()->getSize());
if ((mapMemory == NULL) || (!mapMemory->create(mapData))) {
LogError("[OCL] Fail to allocate map target object");
servicesapi->HsaFreeSystemMemory(mapData);
if (mapMemory) {
mapMemory->release();
}
return false;
}
mapMemory_ = mapMemory;
return true;
}
// Tears down the staging buffer created by allocateMapMemory(): the host
// storage goes back to the HSA system memory pool first, then the wrapping
// buffer object is released and mapMemory_ is reset.
void
Memory::freeMapMemory()
{
    assert(mapMemory_ != NULL);

    void *backingStore = mapMemory_->getHostMem();
    servicesapi->HsaFreeSystemMemory(backingStore);

    mapMemory_->release();
    mapMemory_ = NULL;
}
// Returns the host pointer an application should use for a map of this
// memory object, starting origin[0] bytes into the object.
//
// Selection order:
//   1. the device backing storage, when it is directly host accessible;
//   2. the owner's host memory, when it has one;
//   3. a staging buffer (mapMemory_), allocated on the first indirect map.
//
// The indirect map count is incremented for every successful call and must
// be balanced by decIndMapCount(). On failure NULL is returned and the
// count is left unchanged.
//
// region, mapFlags, rowPitch and slicePitch are not used by this
// implementation.
void *
Memory::allocMapTarget(const amd::Coord3D &origin,
                       const amd::Coord3D &region,
                       uint mapFlags,
                       size_t *rowPitch,
                       size_t *slicePitch)
{
    // Map/Unmap must be serialized.
    amd::ScopedLock lock(owner()->lockMemoryOps());

    incIndMapCount();

    // If the device backing storage is direct accessible, use it.
    if (isHostMemDirectAccess()) {
        return (static_cast<char *>(deviceMemory_) + origin[0]);
    }

    // Otherwise, check for host memory.
    void *hostMem = owner()->getHostMem();
    if (hostMem != NULL) {
        return (static_cast<char *>(hostMem) + origin[0]);
    }

    if (indirectMapCount_ == 1) {
        // First outstanding map: allocate the staging buffer.
        if (!allocateMapMemory(owner()->getSize())) {
            decIndMapCount();
            return NULL;
        }
    }
    else if (mapMemory_ == NULL) {
        // Other maps are outstanding but no staging buffer exists.
        // Undo the increment taken above; otherwise the count never drops
        // back to the value that triggers a fresh allocation.
        LogError("Could not map target resource");
        decIndMapCount();
        return NULL;
    }

    return (static_cast<char *>(mapMemory_->getHostMem()) + origin[0]);
}
// Drops one reference from the indirect map count. When the count reaches
// zero and a staging buffer exists, the staging buffer is freed. Calling
// this with a zero count only logs an error.
void
Memory::decIndMapCount()
{
    // Map/Unmap must be serialized.
    amd::ScopedLock lock(owner()->lockMemoryOps());

    if (indirectMapCount_ == 0) {
        LogError("decIndMapCount() called when indirectMapCount_ already zero");
        return;
    }

    --indirectMapCount_;

    // Last outstanding map gone: the staging buffer is no longer needed.
    const bool lastUnmap = (indirectMapCount_ == 0);
    if (lastUnmap && (mapMemory_ != NULL)) {
        freeMapMemory();
    }
}
// Maps the whole memory object for CPU access and returns the host pointer.
//
// A map target is obtained from allocMapTarget(). When the device storage
// is not directly host accessible, the current device contents are read
// into the map target with a blocking blit before returning.
//
// flags, startLayer and numLayers are not used by this implementation;
// rowPitch and slicePitch are forwarded to allocMapTarget().
//
// Returns NULL when no map target could be obtained or the read-back
// failed.
void *
Memory::cpuMap(
    device::VirtualDevice& vDev,
    uint flags,
    uint startLayer,
    uint numLayers,
    size_t* rowPitch,
    size_t* slicePitch
    )
{
    // Create the map target.
    void * mapTarget =
        allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), 0, rowPitch, slicePitch);
    if (mapTarget == NULL) {
        // Never hand a NULL destination to the blit manager. The map count
        // is not decremented here: allocMapTarget() undoes its own
        // increment when the staging allocation fails.
        return NULL;
    }

    // Sync to map target if no direct access.
    if (!isHostMemDirectAccess()) {
        if (!vDev.blitMgr().readBuffer(
            *this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) {
            decIndMapCount();
            return NULL;
        }
    }

    return mapTarget;
}
// Ends a CPU map started with cpuMap().
//
// When the device storage is not directly host accessible, the contents of
// the map target are written back to the device with a blocking blit. The
// indirect map count is then decremented, which frees the staging buffer
// on the last unmap.
void
Memory::cpuUnmap(device::VirtualDevice& vDev)
{
    // Sync to device backing storage if no direct access.
    if (!isHostMemDirectAccess()) {
        // allocMapTarget() hands out either the staging buffer or the
        // owner's host memory. mapMemory_ is NULL in the second case, so
        // it must not be dereferenced unconditionally.
        void *mapSource = NULL;
        if (mapMemory_ != NULL) {
            mapSource = mapMemory_->getHostMem();
        }
        else {
            mapSource = owner()->getHostMem();
        }

        if (mapSource == NULL) {
            LogError("[OCL] No map target to sync on cpuUnmap");
        }
        else if (!vDev.blitMgr().writeBuffer(
            mapSource, *this, amd::Coord3D(0),
            amd::Coord3D(size()), true)) {
            LogError("[OCL] Fail sync the device memory on cpuUnmap");
        }
    }

    decIndMapCount();
}
// Releases the HSA mapping of an interop resource according to
// interopType_:
//   - D3D10 / D3D11 (Windows builds only): unmap the resource;
//   - GL: release the GL resource, then unmap it.
// Failures are logged; a failed GL release does not prevent the unmap.
void Memory::destroyInterop()
{
    HsaStatus status;

#ifdef _WIN32
    if (interopType_ == InteropD3D10) {
        status = hsacoreapi->HsaUnmapD3D10Resource(
            dev_.getBackendDevice(), d3d10Resource_);
        if (status != kHsaStatusSuccess) {
            LogError("[OCL] Fail on HsaUnmapD3D10Resource");
            return;
        }
    }
    else if (interopType_ == InteropD3D11) {
        status = hsacoreapi->HsaUnmapD3D11Resource(
            dev_.getBackendDevice(), d3d11Resource_);
        if (status != kHsaStatusSuccess) {
            LogError("[OCL] Fail on HsaUnmapD3D11Resource");
            return;
        }
    }
#endif

    // Everything below concerns GL interop only.
    if (interopType_ != InteropGL) {
        return;
    }

    void *glContext = owner()->getContext().info().hCtx_;

    status = hsacoreapi->HsaReleaseGLResources(
        dev_.getBackendDevice(), glContext, &glResource_, 1);
    if (status != kHsaStatusSuccess) {
        LogError("[OCL] Fail on HsaReleaseGLResources");
    }

    status = hsacoreapi->HsaUnmapGLResource(
        dev_.getBackendDevice(), glContext, &glResource_);
    if (status != kHsaStatusSuccess) {
        LogError("[OCL] Fail on HsaUnmapGLResource");
    }
}
bool
Memory::isHsaLocalMemory() const {
if (owner()->isInterop()) {
return true;
}
else {
if (amd::Is64Bits()) {
uint64_t addr = reinterpret_cast<uint64_t>(deviceMemory_);
// Fast check: in 64 bits, CPU can only access the high area
// (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0).
// Reference: GFXIP7_ShaderIO_Delt.doc
addr >>= 47; // discard least significant 47 bits
return (addr != 0x1FFFF && addr != 0);
}
else {
const HsaMemoryDescriptor &memDesc =
dev_.getBackendDevice()->memory_descriptors[0];
if (memDesc.heap_type == kHsaHeapTypeFrameBufferPrivate) {
const uintptr_t addr =
reinterpret_cast<uintptr_t>(deviceMemory_);
const uintptr_t gpuvmBase = memDesc.virtual_base_address;
const size_t size = memDesc.size_in_bytes;
return (addr >= gpuvmBase && addr < (gpuvmBase + size));
}
}
}
return false;
}
/////////////////////////////////oclhsa::Buffer//////////////////////////////
// Constructs a buffer wrapper; all state lives in the oclhsa::Memory base.
Buffer::Buffer(const oclhsa::Device &dev, amd::Memory &owner)
    : oclhsa::Memory(dev, owner)
{
}
// Releases the backing storage through destroy().
Buffer::~Buffer()
{
    destroy();
}
// Releases whatever backs this buffer, depending on how it was created:
//   - sub-buffer (has a parent): nothing, the storage is a slice of the
//     parent's allocation;
//   - interop: handled by destroyInterop();
//   - registered host memory: deregistered;
//   - device allocation: freed with HsaFreeDeviceMemory;
//   - directly accessible system allocation: freed, unless it is the
//     owner's own host pointer.
void
Buffer::destroy()
{
    if (owner()->parent() != NULL) {
        return;
    }

    if (owner()->isInterop()) {
        destroyInterop();
        return;
    }

    if (isHostMemoryRegistered()) {
        hsacoreapi->HsaDeregisterSystemMemory(deviceMemory_);
        return;
    }

    if (!isHostMemDirectAccess()) {
        hsacoreapi->HsaFreeDeviceMemory(deviceMemory_);
        return;
    }

    // If the pointers are identical, the host pointer will be deallocated
    // later on, so skip it here to avoid a double deallocation.
    if (deviceMemory_ != owner()->getHostMem()) {
        hsacoreapi->HsaAmdFreeSystemMemory(deviceMemory_);
    }
}
bool Buffer::createInterop()
{
amd::InteropObject *interopObject = owner()->getInteropObj();
#ifdef _WIN32
if (interopObject->asD3D10Object() != NULL) {
amd::D3D10Object *d3d10Object = interopObject->asD3D10Object();
// 1. Get the D3D11 resource
ID3D10Resource *resource = d3d10Object->getD3D10Resource();
ID3D10Buffer *d3d10Buffer = static_cast<ID3D10Buffer *>(resource);
HsaStatus status = hsacoreapi->HsaMapD3D10Buffer(
dev_.getBackendDevice(), d3d10Buffer, &deviceMemory_);
if (status != kHsaStatusSuccess) {
LogError("[OCL] Fail on HsaMapD3D10Buffer");
return false;
}
interopType_ = InteropD3D10;
d3d10Resource_ = d3d10Buffer;
}
if (interopObject->asD3D11Object() != NULL) {
amd::D3D11Object *d3d11Object = interopObject->asD3D11Object();
// 1. Get the D3D11 resource
ID3D11Resource *resource = d3d11Object->getD3D11Resource();
ID3D11Buffer *d3d11Buffer = static_cast<ID3D11Buffer *>(resource);
HsaStatus status = hsacoreapi->HsaMapD3D11Buffer(
dev_.getBackendDevice(), d3d11Buffer, &deviceMemory_);
if (status != kHsaStatusSuccess) {
LogError("[OCL] Fail on HsaMapD3D10Buffer");
return false;
}
interopType_ = InteropD3D11;
d3d11Resource_ = d3d11Buffer;
}
#endif
if (interopObject->asBufferGL()) {
amd::BufferGL *buffer_gl = interopObject->asBufferGL();
HsaGLResource gl_resource = {0};
gl_resource.name = buffer_gl->getGLName();
gl_resource.type = buffer_gl->getGLInternalFormat();
void * glContext =owner()->getContext().info().hCtx_;
HsaStatus status = hsacoreapi->HsaMapGLBuffer(
dev_.getBackendDevice(), glContext, &gl_resource, &deviceMemory_);
if (status != kHsaStatusSuccess) {
LogError("[OCL] Fail on HsaMapGLBuffer");
return false;
}
status = hsacoreapi->HsaAcquireGLResources( dev_.getBackendDevice(),
glContext,
&gl_resource,
1);
if (status != kHsaStatusSuccess) {
LogError("[OCL] Fail on HsaAcquireGLResources");
return false;
}
interopType_ = InteropGL;
glResource_ = gl_resource;
}
return true;
}
// Creates the backing storage for this buffer.
//
//   - Sub-buffer: points into the parent's device memory (and host memory,
//     if any) at the owner's origin offset.
//   - No CL_MEM_USE_HOST_PTR / CL_MEM_ALLOC_HOST_PTR:
//       * local memory enabled: allocate device memory and, for
//         CL_MEM_COPY_HOST_PTR, upload the host data; there is no fallback
//         to system memory when the device allocation fails;
//       * local memory disabled and no host memory: allocate directly
//         accessible system memory.
//   - Otherwise: reuse the owner's host memory as backing storage,
//     registering it with HSA unless the host memory reference reports it
//     as alloced().
//
// Returns true on success, false otherwise.
bool
Buffer::create()
{
    if (owner()->parent()) {
        // Sub-Buffer creation.
        oclhsa::Memory *parentBuffer =
            static_cast<oclhsa::Memory *>(owner()->parent()->getDeviceMemory(dev_));

        if (parentBuffer == NULL) {
            LogError("[OCL] Fail to allocate parent buffer");
            return false;
        }

        const size_t offset = owner()->getOrigin();
        deviceMemory_ =
            static_cast<char *>(parentBuffer->getDeviceMemory()) + offset;

        void* parentHostPtr = parentBuffer->owner()->getHostMem();
        if (parentHostPtr) {
            owner()->setHostMem(static_cast<char *>(parentHostPtr) + offset);
        }

        // NOTE(review): this ORs the parent's cl_mem flags into the device
        // flags_ word; confirm the two bit layouts are meant to be mixed.
        flags_ |= owner()->parent()->getMemFlags();

        return true;
    }

    // Allocate backing storage in device local memory unless UHP or AHP are set
    const cl_mem_flags memFlags = owner()->getMemFlags();
    if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) {
        bool useDeviceMemory = dev_.settings().enableLocalMemory_;
        size_t alignment = static_cast<size_t>(dev_.info().memBaseAddrAlign_);

        if (useDeviceMemory) {
            hsacoreapi->HsaAllocateDeviceMemory(
                size(), alignment, dev_.getBackendDevice(), &deviceMemory_);

            if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) {
                bool ret = dev_.xferMgr().writeBuffer(owner()->getHostMem(), *this,
                    amd::Coord3D(0), amd::Coord3D(size()), true);
                if (!ret) {
                    hsacoreapi->HsaFreeDeviceMemory(deviceMemory_);
                    deviceMemory_ = NULL;
                }
                return ret;
            }

            // if device memory is depleted, do not fall back to system memory
            return deviceMemory_ != NULL;
        }
        else if (!(owner()->getHostMem())) {
            flags_ |= HostMemoryDirectAccess;
            deviceMemory_ = dev_.hostAlloc(size(), alignment);
            // no need to copy - otherwise, the host pointer will not be NULL
            return deviceMemory_ != NULL;
        }
    }

    flags_ |= HostMemoryDirectAccess;

    void* hostMem = owner()->getHostMem();
    assert(hostMem);
    if (hostMem == NULL) {
        // The assert above is compiled out in release builds; refuse to
        // register or adopt a NULL host pointer.
        LogError("[OCL] Missing host memory for host-backed buffer");
        return false;
    }

    // If there is a host ptr, then register it only if it was not allocated,
    // (=> allocated by us)
    if (!(owner()->getHostMemRef()->alloced())) {
        // Reuse existing host memory for the backing storage and register it.
        //
        // SVM precludes a possible 64-bits optimization in which host buffers
        // allocated by the user (UHP) in the default, coherent space could be
        // mapped into the non-coherent space by means of CreateFileMapping/mmap
        // without copying any data (the "device memory" would be the
        // non-coherent buffer).
        // The optimization cannot be applied because regular buffers allocated
        // using UHP are expected to have same characteristics as the original
        // buffer, i.e., if the original buffer supports atomics then the
        // corresponding OpenCL buffer will support atomics too.
        if (hsacoreapi->HsaRegisterSystemMemory(hostMem, size()) != kHsaStatusSuccess) {
            LogError("[OCL] Failed to register system memory");
            return false;
        }

        // Mark the memory as registered only once registration succeeded;
        // destroy() deregisters deviceMemory_ whenever this flag is set.
        flags_ |= HostMemoryRegistered;
    }

    deviceMemory_ = hostMem;

    return true;
}
bool
Buffer::recreate(size_t newSize, size_t newAlignment, bool forceSystem) {
const size_t memFlag = static_cast<size_t>(owner()->getMemFlags());
if ((memFlag & CL_MEM_ALLOC_HOST_PTR) ||
(memFlag & CL_MEM_USE_HOST_PTR) ||
!dev_.settings().enableLocalMemory_) {
forceSystem = true;
}
void *newDeviceMemory = NULL;
uint hostDirectAccess = 0;
if (forceSystem) {
newDeviceMemory = dev_.hostAlloc(newSize, newAlignment);
if (newDeviceMemory == NULL) {
LogError("[OCL] Fail to reallocate system memory");
return false;
}
// Copy the old data to the new memory location.
if (!dev_.xferMgr().readBuffer(*this, newDeviceMemory,
amd::Coord3D(0),
amd::Coord3D(size()),
true)) {
LogError("[OCL] Fail to copy the current value");
dev_.hostFree(newDeviceMemory);
newDeviceMemory = NULL;
return false;
}
hostDirectAccess = HostMemoryDirectAccess;
}
else {
hsacoreapi->HsaAllocateDeviceMemory(
newSize, newAlignment, dev_.getBackendDevice(), &newDeviceMemory);
if (newDeviceMemory == NULL) {
LogError("[OCL] Fail to reallocate device local memory");
return false;
}
assert(
amd::isMultipleOf(static_cast<char *>(newDeviceMemory),
newAlignment));
// Copy the old data to the new memory location.
if (!dev_.xferMgr().readBuffer(
*this, newDeviceMemory, amd::Coord3D(0), amd::Coord3D(size()),
true)) {
LogError("[OCL] Fail to copy the current value");
hsacoreapi->HsaFreeDeviceMemory(newDeviceMemory);
newDeviceMemory = NULL;
return false;
}
}
destroy();
deviceMemory_ = newDeviceMemory;
if ((memFlag & CL_MEM_ALLOC_HOST_PTR) &&
(owner()->getContext().devices().size() == 1)) {
owner()->setHostMem(deviceMemory_);
}
flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered);
flags_ |= hostDirectAccess;
return true;
}
/////////////////////////////////oclhsa::Image//////////////////////////////
// Constructs an image wrapper: clears the host-direct-access and
// host-registered flags, then fills in the HSA image descriptor from the
// owning amd::Image.
Image::Image(const oclhsa::Device& dev, amd::Memory& owner)
    : oclhsa::Memory(dev, owner)
{
    flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered);
    populateImageDescriptor();
}
// One row of the OpenCL -> HSA image format translation table.
struct ImageFormatLayout {
    cl_image_format clFormat;   // OpenCL channel order + channel data type
    HsaImageFormat  hsaFormat;  // matching HSA image format
};
// Translation table from an OpenCL image format (channel order + channel
// data type) to the corresponding HSA image format. It is searched linearly
// by Image::populateImageDescriptor(); rows are grouped by channel order.
static const ImageFormatLayout
ImageFormatLayoutMap[] = {
// CL_R
{ { CL_R, CL_UNORM_INT8 }, HSA_IMAGE_FMT_R8_UNORM },
{ { CL_R, CL_UNORM_INT16}, HSA_IMAGE_FMT_R16_UNORM },
{ { CL_R, CL_SNORM_INT8 }, HSA_IMAGE_FMT_R8_SNORM },
{ { CL_R, CL_SNORM_INT16}, HSA_IMAGE_FMT_R16_SNORM },
{ { CL_R, CL_SIGNED_INT8}, HSA_IMAGE_FMT_R8_SINT },
{ { CL_R, CL_SIGNED_INT16}, HSA_IMAGE_FMT_R16_SINT},
{ { CL_R, CL_SIGNED_INT32}, HSA_IMAGE_FMT_R32_SINT},
{ { CL_R, CL_UNSIGNED_INT8},HSA_IMAGE_FMT_R8_UINT },
{ { CL_R, CL_UNSIGNED_INT16}, HSA_IMAGE_FMT_R16_UINT},
{ { CL_R, CL_UNSIGNED_INT32}, HSA_IMAGE_FMT_R32_UINT},
{ { CL_R, CL_HALF_FLOAT}, HSA_IMAGE_FMT_R_HALFFLOAT},
{ { CL_R, CL_FLOAT }, HSA_IMAGE_FMT_R_FLOAT},
// CL_A
{ { CL_A, CL_UNORM_INT8 }, HSA_IMAGE_FMT_A8_UNORM},
{ { CL_A, CL_UNORM_INT16 }, HSA_IMAGE_FMT_A16_UNORM},
{ { CL_A, CL_SNORM_INT8 }, HSA_IMAGE_FMT_A8_SNORM},
{ { CL_A, CL_SNORM_INT16 }, HSA_IMAGE_FMT_A16_SNORM},
{ { CL_A, CL_SIGNED_INT8 }, HSA_IMAGE_FMT_A8_SINT},
{ { CL_A, CL_SIGNED_INT16 },HSA_IMAGE_FMT_A16_SINT},
{ { CL_A, CL_SIGNED_INT32}, HSA_IMAGE_FMT_A32_SINT},
{ { CL_A, CL_UNSIGNED_INT8 },HSA_IMAGE_FMT_A8_UINT},
{ { CL_A, CL_UNSIGNED_INT16}, HSA_IMAGE_FMT_A16_UINT},
{ { CL_A, CL_UNSIGNED_INT32}, HSA_IMAGE_FMT_A32_UINT},
{ { CL_A, CL_HALF_FLOAT}, HSA_IMAGE_FMT_A_HALFFLOAT},
{ { CL_A, CL_FLOAT}, HSA_IMAGE_FMT_A_FLOAT},
// CL_RG
{ { CL_RG,CL_UNORM_INT8}, HSA_IMAGE_FMT_R8G8_UNORM},
{ { CL_RG,CL_UNORM_INT16},HSA_IMAGE_FMT_R16G16_UNORM},
{ { CL_RG,CL_SNORM_INT8}, HSA_IMAGE_FMT_R8G8_SNORM},
{ { CL_RG,CL_SNORM_INT16},HSA_IMAGE_FMT_R16G16_SNORM},
{ { CL_RG,CL_SIGNED_INT8},HSA_IMAGE_FMT_R8G8_SINT},
{ { CL_RG,CL_SIGNED_INT16},HSA_IMAGE_FMT_R16G16_SINT},
{ { CL_RG,CL_SIGNED_INT32},HSA_IMAGE_FMT_R32G32_SINT},
{ { CL_RG,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_R8G8_UINT},
{ { CL_RG,CL_UNSIGNED_INT16},HSA_IMAGE_FMT_R16G16_UINT},
{ { CL_RG,CL_UNSIGNED_INT32},HSA_IMAGE_FMT_R32G32_UINT},
{ { CL_RG,CL_HALF_FLOAT},HSA_IMAGE_FMT_RG_HALFFLOAT},
{ { CL_RG,CL_FLOAT},HSA_IMAGE_FMT_RG_FLOAT},
// CL_RA
{ { CL_RA,CL_UNORM_INT8}, HSA_IMAGE_FMT_R8A8_UNORM},
{ { CL_RA,CL_UNORM_INT16},HSA_IMAGE_FMT_R16A16_UNORM},
{ { CL_RA,CL_SNORM_INT8}, HSA_IMAGE_FMT_R8A8_SNORM},
{ { CL_RA,CL_SNORM_INT16},HSA_IMAGE_FMT_R16A16_SNORM},
{ { CL_RA,CL_SIGNED_INT8},HSA_IMAGE_FMT_R8A8_SINT},
{ { CL_RA,CL_SIGNED_INT16},HSA_IMAGE_FMT_R16A16_SINT},
{ { CL_RA,CL_SIGNED_INT32},HSA_IMAGE_FMT_R32A32_SINT},
{ { CL_RA,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_R8A8_UINT},
{ { CL_RA,CL_UNSIGNED_INT16},HSA_IMAGE_FMT_R16A16_UINT},
{ { CL_RA,CL_UNSIGNED_INT32},HSA_IMAGE_FMT_R32A32_UINT},
{ { CL_RA,CL_HALF_FLOAT},HSA_IMAGE_FMT_RA_HALFFLOAT},
{ { CL_RA,CL_FLOAT},HSA_IMAGE_FMT_RA_FLOAT},
// CL_RGBA
{ { CL_RGBA,CL_UNORM_INT8}, HSA_IMAGE_FMT_R8G8B8A8_UNORM},
{ { CL_RGBA,CL_UNORM_INT16},HSA_IMAGE_FMT_R16G16B16A16_UNORM},
{ { CL_RGBA,CL_SNORM_INT8}, HSA_IMAGE_FMT_R8G8B8A8_SNORM},
{ { CL_RGBA,CL_SNORM_INT16},HSA_IMAGE_FMT_R16G16B16A16_SNORM},
{ { CL_RGBA,CL_SIGNED_INT8},HSA_IMAGE_FMT_R8G8B8A8_SINT},
{ { CL_RGBA,CL_SIGNED_INT16},HSA_IMAGE_FMT_R16G16B16A16_SINT},
{ { CL_RGBA,CL_SIGNED_INT32},HSA_IMAGE_FMT_R32G32B32A32_SINT},
{ { CL_RGBA,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_R8G8B8A8_UINT},
{ { CL_RGBA,CL_UNSIGNED_INT16},HSA_IMAGE_FMT_R16G16B16A16_UINT},
{ { CL_RGBA,CL_UNSIGNED_INT32},HSA_IMAGE_FMT_R32G32B32A32_UINT},
{ { CL_RGBA,CL_HALF_FLOAT},HSA_IMAGE_FMT_RGBA_HALFFLOAT},
{ { CL_RGBA,CL_FLOAT},HSA_IMAGE_FMT_RGBA_FLOAT},
// CL_ARGB (8-bit channel types only)
{ { CL_ARGB,CL_UNORM_INT8},HSA_IMAGE_FMT_A8R8G8B8_UNORM},
{ { CL_ARGB,CL_SNORM_INT8},HSA_IMAGE_FMT_A8R8G8B8_SNORM},
{ { CL_ARGB,CL_SIGNED_INT8},HSA_IMAGE_FMT_A8R8G8B8_SINT},
{ { CL_ARGB,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_A8R8G8B8_UINT},
// CL_BGRA (8-bit channel types only)
{ { CL_BGRA,CL_UNORM_INT8},HSA_IMAGE_FMT_B8G8R8A8_UNORM},
{ { CL_BGRA,CL_SNORM_INT8},HSA_IMAGE_FMT_B8G8R8A8_SNORM},
{ { CL_BGRA,CL_SIGNED_INT8},HSA_IMAGE_FMT_B8G8R8A8_SINT},
{ { CL_BGRA,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_B8G8R8A8_UINT},
// CL_LUMINANCE
{ {CL_LUMINANCE,CL_SNORM_INT8}, HSA_IMAGE_FMT_L8_SNORM},
{ {CL_LUMINANCE,CL_SNORM_INT16},HSA_IMAGE_FMT_L16_SNORM},
{ {CL_LUMINANCE,CL_UNORM_INT8},HSA_IMAGE_FMT_L8_UNORM},
{ {CL_LUMINANCE,CL_UNORM_INT16},HSA_IMAGE_FMT_L16_UNORM},
{ {CL_LUMINANCE,CL_HALF_FLOAT},HSA_IMAGE_FMT_L_HALFFLOAT},
{ {CL_LUMINANCE,CL_FLOAT},HSA_IMAGE_FMT_L_FLOAT},
// CL_INTENSITY
{ {CL_INTENSITY,CL_SNORM_INT8}, HSA_IMAGE_FMT_I8_SNORM},
{ {CL_INTENSITY,CL_SNORM_INT16},HSA_IMAGE_FMT_I16_SNORM},
{ {CL_INTENSITY,CL_UNORM_INT8},HSA_IMAGE_FMT_I8_UNORM},
{ {CL_INTENSITY,CL_UNORM_INT16},HSA_IMAGE_FMT_I16_UNORM},
{ {CL_INTENSITY,CL_HALF_FLOAT},HSA_IMAGE_FMT_I_HALFFLOAT},
{ {CL_INTENSITY,CL_FLOAT},HSA_IMAGE_FMT_I_FLOAT},
// CL_RGB (packed formats)
{ {CL_RGB, CL_UNORM_SHORT_565},HSA_IMAGE_FMT_R5G6B5_UNORM},
{ {CL_RGB, CL_UNORM_SHORT_555},HSA_IMAGE_FMT_R5G5B5_UNORM},
{ {CL_RGB, CL_UNORM_INT_101010},HSA_IMAGE_FMT_R10G10B10_UNORM}
};
void
Image::populateImageDescriptor()
{
amd::Image* image = owner()->asImage();
// build HSA runtime image descriptor
imageDescriptor_.width = image->getWidth();
imageDescriptor_.height = image->getHeight();
imageDescriptor_.depth = image->getDepth();
imageDescriptor_.arraySize = 0;
// Device specific image does not require rowpitch/slicepitch information.
// Only image buffer is required to specify rowpitch size.
imageDescriptor_.rowPitchInBytes = 0;
imageDescriptor_.slicePitchInBytes = 0;
switch (image->getType())
{
case CL_MEM_OBJECT_IMAGE1D:
imageDescriptor_.geometry = HSA_GEOMETRY_1D;
imageDescriptor_.height = 1;
imageDescriptor_.depth = 1;
break;
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
imageDescriptor_.geometry = HSA_GEOMETRY_1DBuffer;
imageDescriptor_.height = 1;
imageDescriptor_.depth = 1;
break;
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
//@todo - arraySize = height ?!
imageDescriptor_.geometry = HSA_GEOMETRY_1DArray;
imageDescriptor_. height = 1;
imageDescriptor_.arraySize = image->getHeight();
break;
case CL_MEM_OBJECT_IMAGE2D:
imageDescriptor_.geometry = HSA_GEOMETRY_2D;
imageDescriptor_.depth = 1;
break;
case CL_MEM_OBJECT_IMAGE2D_ARRAY:
//@todo - arraySize = depth ?!
imageDescriptor_.geometry = HSA_GEOMETRY_2DArray;
imageDescriptor_.depth = 1;
imageDescriptor_.arraySize = image->getDepth();
break;
case CL_MEM_OBJECT_IMAGE3D:
imageDescriptor_.geometry = HSA_GEOMETRY_3D;
break;
}
for (uint i = 0; i < sizeof(ImageFormatLayoutMap) / sizeof(ImageFormatLayout); ++i) {
if ((image->getImageFormat().image_channel_data_type ==
ImageFormatLayoutMap[i].clFormat.image_channel_data_type) &&
(image->getImageFormat().image_channel_order ==
ImageFormatLayoutMap[i].clFormat.image_channel_order)) {
imageDescriptor_.format = ImageFormatLayoutMap[i].hsaFormat;
}
}
}
bool Image::createInterop() {
amd::ScopedLock lock(owner()->lockMemoryOps());
amd::InteropObject *interopObject = owner()->getInteropObj();
void *hsaImageObjectInterop = NULL;
size_t hsaImageObjectInteropSize = 0;
#ifdef _WIN32
if (interopObject->asD3D10Object()) {
amd::D3D10Object *d3d10Object = interopObject->asD3D10Object();
// 1. Get the D3D11 resource
ID3D10Resource *resource = d3d10Object->getD3D10Resource();
HsaStatus status = hsacoreapi->HsaMapD3D10Texture(
dev_.getBackendDevice(), resource, &hsaImageObjectInterop,
&hsaImageObjectInteropSize, kHsaMapFlagsReadWrite);
if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0 ) {
LogError("[OCL] Fail on HsaMapD3D10Texture");
return false;
}
interopType_ = InteropD3D10;
d3d10Resource_ = resource;
}
if (interopObject->asD3D11Object()) {
amd::D3D11Object *d3d11Object = interopObject->asD3D11Object();
// 1. Get the D3D11 resource
ID3D11Resource *resource = d3d11Object->getD3D11Resource();
HsaStatus status = hsacoreapi->HsaMapD3D11Texture(
dev_.getBackendDevice(), resource, &hsaImageObjectInterop,
&hsaImageObjectInteropSize, kHsaMapFlagsReadWrite,
d3d11Object->getPlane());
if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0 ) {
LogError("[OCL] Fail on HsaMapD3D11Texture");
return false;
}
interopType_ = InteropD3D11;
d3d11Resource_ = resource;
}
#endif
if (interopObject->asGLObject()) {
amd::GLObject* gl_object = interopObject->asGLObject();
HsaGLResource gl_resource = {0};
gl_resource.name = gl_object->getGLName();
if (gl_object->getGLTarget() != GL_TEXTURE_CUBE_MAP) {
gl_resource.type = gl_object->getGLTarget();
}
else {
gl_resource.type = gl_object->getCubemapFace();
}
gl_resource.mipmap_level = gl_object->getGLMipLevel();
void * glContext =owner()->getContext().info().hCtx_;
// Get the texture SRD.
HsaStatus status = hsacoreapi->HsaMapGLTexture(
dev_.getBackendDevice(), glContext, &gl_resource,
&hsaImageObjectInterop, &hsaImageObjectInteropSize);
if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0) {
LogError("[OCL] Fail on HsaMapGLTexture");
return false;
}
status = hsacoreapi->HsaAcquireGLResources( dev_.getBackendDevice(),
glContext,
&gl_resource,
1);
if (status != kHsaStatusSuccess) {
LogError("[OCL] Fail on HsaAcquireGLResources");
return false;
}
// Get the flat address for texture buffer.
if (owner()->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
// Map the texture buffer resource as buffer.
HsaStatus status = hsacoreapi->HsaMapGLBuffer(
dev_.getBackendDevice(), glContext, &gl_resource,
&deviceMemory_);
if (status != kHsaStatusSuccess) {
LogError("[OCL] Fail on HsaMapGLBuffer");
return false;
}
// Sanity check.
assert((deviceMemory_ != NULL) &&
"deviceMemory_ should not be \
NULL upon successful return from HsaMapGLBuffer");
}
interopType_ = InteropGL;
glResource_ = gl_resource;
}
// Populate HSA specific information to the interop image object.
HsaStatus status = hsacoreapi->HsaAmdCreateDeviceImageView(
&imageDescriptor_, hsaImageObjectInterop, hsaImageObject_);
if (status != kHsaStatusSuccess) {
LogError("[OCL] Fail to tranform interop image SRD");
return false;
}
return true;
}
// Creates the device image.
//
// An image with a parent becomes a view of the parent (see createView()).
// Otherwise the memory requirements are queried from HSA, backing storage
// is allocated (device local memory when enabled, HSA system memory
// otherwise) and the HSA image object is created on top of it.
//
// Returns true on success, false (after logging) when any step fails. On
// failure, storage that was already allocated stays attached to
// deviceMemory_ and is released by destroy(), which the destructor calls.
bool Image::create()
{
    if (owner()->parent()) {
        // Image view creation
        oclhsa::Image *parentImage =
            static_cast<oclhsa::Image *>(owner()->parent()->getDeviceMemory(dev_));

        if (parentImage == NULL) {
            LogError("[OCL] Fail to allocate parent image");
            return false;
        }

        return createView(*parentImage);
    }

    amd::ScopedLock lock(owner()->lockMemoryOps());

    // Get memory size requirement for device specific image.
    HsaStatus status = hsacoreapi->HsaGetDeviceImageInfo(
        dev_.getBackendDevice(), &imageDescriptor_,
        &deviceImageInfo_);
    if (status != kHsaStatusSuccess) {
        // This is a query failure, not an allocation failure.
        LogError("[OCL] Fail to get image memory requirements");
        return false;
    }

    if (dev_.settings().enableLocalMemory_) {
        status = hsacoreapi->HsaAllocateDeviceMemory(
            deviceImageInfo_.imageSizeInBytes,
            deviceImageInfo_.imageAlignmentInBytes,
            dev_.getBackendDevice(),
            &deviceMemory_);
    } else {
        status = servicesapi->HsaAllocateSystemMemory(
            deviceImageInfo_.imageSizeInBytes,
            deviceImageInfo_.imageAlignmentInBytes,
            kHsaSystemMemoryTypeDefault,
            &deviceMemory_);
    }
    if (status != kHsaStatusSuccess) {
        LogError("[OCL] Fail to allocate image memory");
        return false;
    }

    assert(amd::isMultipleOf(
        deviceMemory_, deviceImageInfo_.imageAlignmentInBytes));

    status = hsacoreapi->HsaCreateDeviceImage(
        dev_.getBackendDevice(), &imageDescriptor_,
        deviceMemory_, &hsaImageObject_[0]);
    if (status != kHsaStatusSuccess) {
        // Do not report success with an unusable image object.
        LogError("[OCL] Fail to create HSA image object");
        return false;
    }

    return true;
}
// Makes this image a view of `parent`.
//
//   - Parent is a buffer: this image shares the parent's device memory and
//     a new HSA image object is created over it, using the image's row
//     pitch.
//   - Parent is an image: an HSA image view of the parent's image object is
//     created from this image's descriptor.
//
// Returns false (after logging) when the HSA call fails.
bool
Image::createView(Image &parent)
{
    amd::ScopedLock lock(owner()->lockMemoryOps());

    if (!parent.owner()->asBuffer()) {
        // Get the view of the existing parent's SRD based on the child's
        // image descriptor.
        const HsaStatus viewStatus = hsacoreapi->HsaAmdCreateDeviceImageView(
            &imageDescriptor_, parent.getHsaImageObjectAddress(),
            &hsaImageObject_[0]);
        if (viewStatus != kHsaStatusSuccess) {
            LogError("[OCL] Fail to get view of parent image");
            return false;
        }
        return true;
    }

    // Get new texture SRD since parent is a buffer.
    deviceMemory_ = parent.getDeviceMemory();

    // Force device specific image implementation to use rowpitch size.
    amd::Image *self = owner()->asImage();
    imageDescriptor_.rowPitchInBytes = self->getRowPitch();

    const HsaStatus createStatus = hsacoreapi->HsaCreateDeviceImage(
        dev_.getBackendDevice(), &imageDescriptor_,
        deviceMemory_, &hsaImageObject_[0]);
    if (createStatus != kHsaStatusSuccess) {
        LogError("[OCL] Fail to create HSA image object");
        return false;
    }
    return true;
}
// Returns the host pointer an application should use for a map of this
// image, positioned at `origin`.
//
// The owner's host memory is used when it exists; otherwise a staging
// buffer (mapMemory_) is allocated on the first indirect map. The returned
// pointer is offset by origin[0] elements, origin[1] rows and origin[2]
// slices, using the image's own row and slice pitch.
//
// rowPitch / slicePitch, when non-NULL, receive the image's row and slice
// pitch. region and mapFlags are not used by this implementation.
//
// The indirect map count is incremented for every successful call and must
// be balanced by decIndMapCount(). On failure NULL is returned and the
// count is left unchanged.
void* Image::allocMapTarget(const amd::Coord3D& origin,
                            const amd::Coord3D& region,
                            uint mapFlags,
                            size_t* rowPitch,
                            size_t* slicePitch)
{
    amd::ScopedLock lock(owner()->lockMemoryOps());

    incIndMapCount();

    void* pHostMem = owner()->getHostMem();

    if (pHostMem == NULL) {
        if (indirectMapCount_ == 1) {
            // First outstanding map: allocate the staging buffer.
            if (!allocateMapMemory(owner()->getSize())) {
                decIndMapCount();
                return NULL;
            }
        }
        else if (mapMemory_ == NULL) {
            // Other maps are outstanding but no staging buffer exists.
            // Undo the increment taken above so the count stays balanced.
            LogError("Could not map target resource");
            decIndMapCount();
            return NULL;
        }
        pHostMem = mapMemory_->getHostMem();
    }

    amd::Image* image = owner()->asImage();

    size_t elementSize = image->getImageFormat().getElementSize();

    size_t offset = origin[0] * elementSize;

    // Adjust offset with Y dimension
    offset += image->getRowPitch() * origin[1];

    // Adjust offset with Z dimension
    offset += image->getSlicePitch() * origin[2];

    // Both pitch outputs are optional.
    if (rowPitch != NULL) {
        *rowPitch = image->getRowPitch();
    }
    if (slicePitch != NULL) {
        *slicePitch = image->getSlicePitch();
    }

    return (static_cast<uint8_t*>(pHostMem) + offset);
}
// Releases the backing storage through destroy().
Image::~Image()
{
    destroy();
}
// Releases whatever backs this image:
//   - image with a parent: nothing is freed here;
//   - interop: handled by destroyInterop();
//   - otherwise: the storage is returned to the allocator chosen by the
//     enableLocalMemory_ setting (device memory or HSA system memory),
//     mirroring the choice made in create().
void
Image::destroy()
{
    if (owner()->parent() != NULL) {
        return;
    }

    if (owner()->isInterop()) {
        destroyInterop();
        return;
    }

    const bool usesLocalMemory = dev_.settings().enableLocalMemory_;
    if (usesLocalMemory) {
        hsacoreapi->HsaFreeDeviceMemory(deviceMemory_);
    }
    else {
        servicesapi->HsaFreeSystemMemory(deviceMemory_);
    }
}
}
#endif // WITHOUT_FSA_BACKEND