Files
rocm-systems/rocclr/runtime/device/gpu/gpumemory.cpp
T
foreman 8b359d4f5a P4 to Git Change 1113301 by gandryey@gera-dev-w7 on 2015/01/20 12:25:37
ECR #304775 - Add remote alloc check for direct host access optimization.
	- If runtime forces allocations into remote memory, then AHP check won't cover normal allocations.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.cpp#115 edit
2015-01-20 12:31:31 -05:00

1381 lines
43 KiB
C++

//! Implementation of GPU device memory management
#include "top.hpp"
#include "thread/thread.hpp"
#include "thread/monitor.hpp"
#include "device/device.hpp"
#include "device/gpu/gpudevice.hpp"
#include "device/gpu/gpublit.hpp"
#ifdef _WIN32
#include <d3d10_1.h>
#include "amdocl/cl_d3d9_amd.hpp"
#include "amdocl/cl_d3d10_amd.hpp"
#include "amdocl/cl_d3d11_amd.hpp"
#endif //_WIN32
#include "amdocl/cl_gl_amd.hpp"
#include <string>
#include <fstream>
#include <sstream>
#include <iostream>
//! Turn this on to enable sanity checks before and after every heap operation.
#if DEBUG
#define EXTRA_HEAP_CHECKS 1
#endif // DEBUG
namespace gpu {
//! Constructs a linear GPU memory object for \a owner.
//! If a heap block \a hb is supplied, its size_ (rather than \a size) defines
//! the resource width, and the block is linked back to this object.
Memory::Memory(
    const Device&   gpuDev,
    amd::Memory&    owner,
    HeapBlock*      hb,
    size_t          size)
    : device::Memory(owner)
    , Resource(gpuDev,
        ((hb != NULL) ? hb->size_ : size) / Heap::ElementSize,
        Heap::ElementType)
    , hb_(hb)
{
    init();

    // Link the heap block back to this memory object
    if (hb_ != NULL) {
        hb_->setMemory(this);
    }

    // An owner that has a parent is flagged as a sub-memory object
    if (NULL != owner.parent()) {
        flags_ |= SubMemoryObject;
    }
}
//! Constructs an owner-less linear GPU memory object on top of heap block \a hb.
//! The resource width is the block size expressed in heap elements.
Memory::Memory(const Device& gpuDev, HeapBlock& hb)
    : device::Memory(hb.size_)
    , Resource(gpuDev, hb.size_ / Heap::ElementSize, Heap::ElementType)
    , hb_(&hb)
{
    init();

    // Link the heap block back to this memory object
    hb_->setMemory(this);
}
//! Constructs an owner-less linear GPU memory object of \a size bytes
//! without a heap block. The resource width is \a size rounded up to a
//! whole number of heap elements.
Memory::Memory(const Device& gpuDev, size_t size)
    : device::Memory(size)
    , Resource(gpuDev,
        amd::alignUp(size, Heap::ElementSize) / Heap::ElementSize,
        Heap::ElementType)
    , hb_(NULL)
{
    init();
}
//! Constructs a 1D GPU memory object for \a owner with an explicit
//! \a width and surface \a format. No heap block is attached.
Memory::Memory(
    const Device&   gpuDev,
    amd::Memory&    owner,
    size_t          width,
    cmSurfFmt       format)
    : device::Memory(owner)
    , Resource(gpuDev, width, format)
    , hb_(NULL)
{
    init();

    // An owner that has a parent is flagged as a sub-memory object
    if (NULL != owner.parent()) {
        flags_ |= SubMemoryObject;
    }
}
//! Constructs an owner-less 1D GPU memory object of \a size bytes with an
//! explicit \a width and surface \a format. No heap block is attached.
Memory::Memory(
    const Device&   gpuDev,
    size_t          size,
    size_t          width,
    cmSurfFmt       format)
    : device::Memory(size)
    , Resource(gpuDev, width, format)
    , hb_(NULL)
{
    init();
}
//! Constructs an image GPU memory object for \a owner with the given
//! dimensions, surface format, channel order and image type.
//! No heap block is attached.
Memory::Memory(
    const Device&       gpuDev,
    amd::Memory&        owner,
    size_t              width,
    size_t              height,
    size_t              depth,
    cmSurfFmt           format,
    gslChannelOrder     chOrder,
    cl_mem_object_type  imageType)
    : device::Memory(owner)
    , Resource(gpuDev, width, height, depth, format, chOrder, imageType)
    , hb_(NULL)
{
    init();

    // An owner that has a parent is flagged as a sub-memory object
    if (NULL != owner.parent()) {
        flags_ |= SubMemoryObject;
    }
}
//! Constructs an owner-less image GPU memory object of \a size bytes with the
//! given dimensions, surface format, channel order and image type.
//! No heap block is attached.
Memory::Memory(
    const Device&       gpuDev,
    size_t              size,
    size_t              width,
    size_t              height,
    size_t              depth,
    cmSurfFmt           format,
    gslChannelOrder     chOrder,
    cl_mem_object_type  imageType)
    : device::Memory(size)
    , Resource(gpuDev, width, height, depth, format, chOrder, imageType)
    , hb_(NULL)
{
    init();
}
//! Resets the members shared by all constructors to their initial state:
//! no parent, no pinned or interop memory, and a zero indirect map count.
void
Memory::init()
{
    // Object relations
    parent_         = NULL;
    pinnedMemory_   = NULL;

    // Interop state
    interopMemory_  = NULL;
    interopType_    = InteropNone;

    // Indirect map bookkeeping
    indirectMapCount_ = 0;
}
#ifdef _WIN32
//! Returns the shared handle of a D3D resource.
//! \param pIface  Interface pointer that is queried for IDXGIResource
//! \return The shared handle, or a zero handle if the interface query or
//!         the GetSharedHandle() call didn't succeed
static HANDLE
getSharedHandle(IUnknown* pIface)
{
    // Sanity checks
    assert(pIface != NULL);

    // Obtain the DXGI view of the resource
    IDXGIResource* dxgiResource = NULL;
    const HRESULT queryStatus = pIface->QueryInterface(
        __uuidof(IDXGIResource), (void**) &dxgiResource);
    if ((queryStatus != S_OK) || (dxgiResource == NULL)) {
        return (HANDLE) 0;
    }

    // Fetch the handle and drop the reference taken by QueryInterface()
    HANDLE sharedHandle;
    const HRESULT handleStatus = dxgiResource->GetSharedHandle(&sharedHandle);
    dxgiResource->Release();

    return (handleStatus == S_OK) ? sharedHandle : (HANDLE) 0;
}
#endif //_WIN32
//! Creates the underlying resource and derives the memory object flags
//! (direct host access, sub-memory object) and the parent from the
//! resulting memory type.
//! \param memType  Requested memory type, forwarded to Resource::create()
//! \param params   Creation parameters; interpreted as ViewParams,
//!                 ImageViewParams or ImageBufferParams for view types
//! \return true if the resource was created
bool
Memory::create(
    Resource::MemoryType    memType,
    Resource::CreateParams* params)
{
    // Drop the flag up front, in case the heap gets reallocated in local/remote
    flags_ &= ~HostMemoryDirectAccess;

    // Nothing else to set up if the resource couldn't be created
    if (!Resource::create(memType, params)) {
        return false;
    }

    switch (memoryType()) {
    case Resource::Pinned:
    case Resource::ExternalPhysical:
        // GPU accesses the host memory directly
        flags_ |= HostMemoryDirectAccess;
        break;

    case Resource::Remote:
    case Resource::RemoteUSWC: {
        // @todo Enable unconditional optimization for remote memory
        const bool allocHostPtr = (owner() != NULL) &&
            (owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR);
        if ((allocHostPtr || (hb() == NULL)) && !cal()->tiled_) {
            // GPU accesses the host memory directly
            flags_ |= HostMemoryDirectAccess;
        }
        break;
    }

    case Resource::View: {
        Resource::ViewParams* viewParams =
            reinterpret_cast<Resource::ViewParams*>(params);

        // Find out if the parent was allocated in system memory
        const bool parentPinned =
            (viewParams->resource_->memoryType() == Resource::Pinned);
        const bool parentRemote =
            (viewParams->resource_->memoryType() == Resource::Remote) ||
            (viewParams->resource_->memoryType() == Resource::RemoteUSWC);

        // @todo Enable unconditional optimization for remote memory
        // Remote parents qualify only for external allocations, to avoid
        // the optimization for non-VM (double copy) mode
        const bool externalAlloc = (owner() != NULL) &&
            ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) ||
             dev().settings().remoteAlloc_);

        if (parentPinned || (parentRemote && externalAlloc)) {
            // GPU accesses the host memory directly
            flags_ |= HostMemoryDirectAccess;
        }

        // Views of a sub-object owner keep a link to the parent memory
        if ((viewParams->owner_ != NULL) &&
            (viewParams->owner_->parent() != NULL)) {
            parent_ = reinterpret_cast<const Memory*>(viewParams->memory_);
            flags_ |= SubMemoryObject;
        }
        break;
    }

    case Resource::ImageView: {
        Resource::ImageViewParams* imageView =
            reinterpret_cast<Resource::ImageViewParams*>(params);
        parent_ = reinterpret_cast<const Memory*>(imageView->memory_);
        flags_ |= SubMemoryObject;
        break;
    }

    case Resource::ImageBuffer: {
        Resource::ImageBufferParams* imageBuffer =
            reinterpret_cast<Resource::ImageBufferParams*>(params);
        parent_ = reinterpret_cast<const Memory*>(imageBuffer->memory_);
        flags_ |= SubMemoryObject;
        break;
    }

    default:
        break;
    }

    return true;
}
//! Dispatches a GL resource operation to the matching GSL call.
//! \param operation  GLDecompressResource -> gslGLAcquire(),
//!                   GLInvalidateFBO -> gslGLRelease()
//! \return Result of the GSL call; false (after an assert) for any
//!         other operation
bool Memory::processGLResource(GLResourceOP operation)
{
    if (operation == GLDecompressResource) {
        return gslGLAcquire();
    }

    if (operation == GLInvalidateFBO) {
        return gslGLRelease();
    }

    assert(false && "unknown GLResourceOP");
    return false;
}
//! Creates the interop resource for the owner's interop object
//! (D3D10/D3D11/D3D9 on Windows, or OpenGL).
//! The interop creation parameters are filled from the interop object
//! description. With InteropDirectAccess the resource is created on this
//! object; for any other type a separate buffer object (interopMemory_) is
//! allocated and the interop resource is created on it.
//! \param type  Requested interop type, recorded with setInteropType() on success
//! \return true on success, false if the object type isn't supported or
//!         the resource creation failed
bool
Memory::createInterop(InteropType type)
{
Resource::MemoryType memType = Resource::Empty;
Resource::OGLInteropParams oglRes;
#ifdef _WIN32
Resource::D3DInteropParams d3dRes;
#endif //_WIN32
// Only external objects support interop
assert(owner() != NULL);
Resource::CreateParams* createParams = NULL;
amd::InteropObject* interop = owner()->getInteropObj();
assert((interop != NULL) && "An invalid interop object is impossible!");
amd::GLObject* glObject = interop->asGLObject();
#ifdef _WIN32
amd::D3D10Object* d3d10Object = interop->asD3D10Object();
amd::D3D11Object* d3d11Object = interop->asD3D11Object();
amd::D3D9Object* d3d9Object = interop->asD3D9Object();
// The D3D object kinds are probed first; GL is the final fallback
if (d3d10Object != NULL) {
createParams = &d3dRes;
d3dRes.owner_ = owner();
const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc();
memType = Resource::D3D10Interop;
// Get shared handle
// NOTE(review): a zero handle doesn't abort here; creation still proceeds - confirm intent
if ((d3dRes.handle_ =
getSharedHandle(d3d10Object->getD3D10Resource()))) {
d3dRes.iDirect3D_ = static_cast<void*>
(d3d10Object->getD3D10Resource());
d3dRes.type_ = Resource::InteropTypeless;
}
d3dRes.misc = 0;
// Find D3D10 object type
switch (objDesc->objDim_) {
case D3D10_RESOURCE_DIMENSION_BUFFER:
d3dRes.type_ = Resource::InteropVertexBuffer;
break;
case D3D10_RESOURCE_DIMENSION_TEXTURE1D:
case D3D10_RESOURCE_DIMENSION_TEXTURE2D:
case D3D10_RESOURCE_DIMENSION_TEXTURE3D:
d3dRes.type_ = Resource::InteropTexture;
// Mipmapped textures are exposed as a view of a single level
if (objDesc->mipLevels_ > 1) {
d3dRes.type_ = Resource::InteropTextureViewLevel;
// For arrays the subresource index is split into layer and mip level
if (objDesc->arraySize_ > 1) {
d3dRes.layer_ = d3d10Object->getSubresource() /
objDesc->mipLevels_;
d3dRes.mipLevel_ = d3d10Object->getSubresource() %
objDesc->mipLevels_;
}
else {
d3dRes.layer_ = 0;
d3dRes.mipLevel_ = d3d10Object->getSubresource();
}
}
break;
default:
return false;
break;
}
}
else if (d3d11Object != NULL) {
createParams = &d3dRes;
d3dRes.owner_ = owner();
const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc();
memType = Resource::D3D11Interop;
// Get shared handle
// NOTE(review): a zero handle doesn't abort here; creation still proceeds - confirm intent
if ((d3dRes.handle_ =
getSharedHandle(d3d11Object->getD3D11Resource()))) {
d3dRes.iDirect3D_ = static_cast<void*>
(d3d11Object->getD3D11Resource());
d3dRes.type_ = Resource::InteropTypeless;
}
d3dRes.misc = 0;
// Find D3D11 object type
switch (objDesc->objDim_) {
case D3D11_RESOURCE_DIMENSION_BUFFER:
d3dRes.type_ = Resource::InteropVertexBuffer;
break;
case D3D11_RESOURCE_DIMENSION_TEXTURE1D:
case D3D11_RESOURCE_DIMENSION_TEXTURE2D:
case D3D11_RESOURCE_DIMENSION_TEXTURE3D:
d3dRes.type_ = Resource::InteropTexture;
// The plane is used as the layer, unless the mip level branch below overrides it
d3dRes.layer_= d3d11Object->getPlane();
d3dRes.misc = d3d11Object->getMiscFlag();
// Mipmapped textures are exposed as a view of a single level
if (objDesc->mipLevels_ > 1) {
d3dRes.type_ = Resource::InteropTextureViewLevel;
// For arrays the subresource index is split into layer and mip level
if (objDesc->arraySize_ > 1) {
d3dRes.layer_ = d3d11Object->getSubresource() /
objDesc->mipLevels_;
d3dRes.mipLevel_ = d3d11Object->getSubresource() %
objDesc->mipLevels_;
}
else {
d3dRes.layer_ = 0;
d3dRes.mipLevel_ = d3d11Object->getSubresource();
}
}
break;
default:
return false;
break;
}
}
else if (d3d9Object != NULL) {
createParams = &d3dRes;
d3dRes.owner_ = owner();
const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc();
memType = Resource::D3D9Interop;
// Get shared handle
// NOTE(review): without a shared handle none of the fields below are set,
// but creation still proceeds - confirm intent
if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) {
d3dRes.iDirect3D_ = static_cast<void*>
(d3d9Object->getD3D9Resource());
d3dRes.type_ = Resource::InteropSurface;
d3dRes.mipLevel_ = 0;
d3dRes.layer_ = d3d9Object->getPlane();
d3dRes.misc = d3d9Object->getMiscFlag();
}
}
else
#endif //_WIN32
if (glObject != NULL) {
createParams = &oglRes;
oglRes.owner_ = owner();
memType = Resource::OGLInterop;
// Fill the interop creation parameters
oglRes.handle_ = static_cast<CALuint>(glObject->getGLName());
// Find OGL object type
switch (glObject->getCLGLObjectType()) {
case CL_GL_OBJECT_BUFFER:
oglRes.type_ = Resource::InteropVertexBuffer;
break;
case CL_GL_OBJECT_TEXTURE_BUFFER:
case CL_GL_OBJECT_TEXTURE1D:
case CL_GL_OBJECT_TEXTURE1D_ARRAY:
case CL_GL_OBJECT_TEXTURE2D:
case CL_GL_OBJECT_TEXTURE2D_ARRAY:
case CL_GL_OBJECT_TEXTURE3D:
oglRes.type_ = Resource::InteropTexture;
// A cubemap is exposed as a view of one face; the layer is the face index
// relative to GL_TEXTURE_CUBE_MAP_POSITIVE_X
if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) {
switch (glObject->getCubemapFace()) {
case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
oglRes.type_ = Resource::InteropTextureViewCube;
oglRes.layer_ =
glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X;
oglRes.mipLevel_ = glObject->getGLMipLevel();
break;
default:
break;
}
}
// A non-zero mip level of a regular texture is exposed as a level view
else if (glObject->getGLMipLevel() != 0) {
oglRes.type_ = Resource::InteropTextureViewLevel;
oglRes.layer_ = 0;
oglRes.mipLevel_ = glObject->getGLMipLevel();
}
break;
case CL_GL_OBJECT_RENDERBUFFER:
oglRes.type_ = Resource::InteropRenderBuffer;
break;
default:
return false;
break;
}
}
else {
// Neither a D3D nor a GL object
return false;
}
// NOTE(review): the GL context fields are filled for the D3D paths as well,
// where createParams doesn't point to oglRes
oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_;
oglRes.glDeviceContext_ = owner()->getContext().info().hDev_;
// We dont pass any flags here for the GL Resource.
oglRes.flags_ = 0;
// Get the interop settings
if (type == InteropDirectAccess) {
// Create memory object
if (!create(memType, createParams)) {
return false;
}
}
else {
// Allocate Resource object for interop as buffer
interopMemory_ = new Memory(dev(), size(),
amd::alignUp(size(), Heap::ElementSize) / Heap::ElementSize,
Heap::ElementType);
// Create the interop object in CAL
if (NULL == interopMemory_ || !interopMemory_->create(memType, createParams)) {
delete interopMemory_;
interopMemory_ = NULL;
return false;
}
}
setInteropType(type);
return true;
}
//! Releases everything attached to this memory object: the VA cache entry,
//! the heap block, the interop memory, the indirect map target, the pinned
//! memory and, finally, the direct host access mapping.
Memory::~Memory()
{
    // Drop this object from the VA cache
    dev().removeVACache(this);

    // Give the heap block back, if there is one
    if (NULL != hb_) {
        // Guard against a simultaneous release with realloc
        amd::ScopedLock k(dev().lockAsyncOps());
        hb_->setMemory(NULL);
        hb_->free();
    }

    delete interopMemory_;

    // Unmap and release the map target, if there is one
    if (mapMemory_ != NULL) {
        mapMemory()->unmap(NULL);
        mapMemory_->release();
    }

    // Destroy pinned memory
    if (flags_ & PinnedMemoryAlloced) {
        delete pinnedMemory_;
    }

    // Direct access mappings of owned, non-view, non-external-physical
    // objects have to be unmapped
    const bool unmapDirectAccess =
        (owner() != NULL) &&
        isHostMemDirectAccess() &&
        !(flags_ & SubMemoryObject) &&
        (memoryType() != Resource::ExternalPhysical);
    if (unmapDirectAccess) {
        unmap(NULL);
    }
}
//! Moves this memory object onto heap block \a hb.
//! A block that has a parent is recreated as a view at the current resource
//! offset; otherwise the resource is reallocated at the block offset.
//! \param hb      New heap block; adopted only on success
//! \param parent  Resource passed on as the view parameters' resource
//! \return true if the resource was recreated and \a hb adopted
bool
Memory::reallocate(HeapBlock* hb, const Resource* parent)
{
    Resource::ViewParams viewParams;
    viewParams.size_        = hb->size_;
    viewParams.resource_    = parent;
    viewParams.memory_      = NULL;

    bool recreated;
    if (hb->parent_ != NULL) {
        // View reallocation: the offset inside the view is unchanged
        viewParams.offset_ = Resource::offset();
        recreated = Resource::create(Resource::View, &viewParams);
    }
    else {
        viewParams.offset_ = hb->offset_;
        recreated = Resource::reallocate(&viewParams);
    }

    // Adopt the new heap block only if the resource was recreated
    if (recreated) {
        hb_ = hb;
    }
    return recreated;
}
//! Brings the device copy of this memory object up to date with the host copy.
//! The sync is skipped entirely for objects with direct host access or
//! without host memory. Otherwise the parent (for sub-objects) and all
//! sub-buffers are synchronized recursively, the local version is updated,
//! and the host data is transferred to the device - through the pinned
//! memory if available, with a plain write as the fallback.
//! \param gpu        Virtual GPU used for the transfers and the final wait
//! \param syncFlags  skipParent_/skipViews_/skipEntire_ control which parts
//!                   of the parent/view hierarchy are processed
void
Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
{
// If the last writer was another GPU, then make a writeback
if (!isHostMemDirectAccess() &&
(owner()->getLastWriter() != NULL) &&
(&dev() != owner()->getLastWriter())) {
mgpuCacheWriteBack();
}
// If host memory doesn't have direct access, then we have to synchronize
if (!isHostMemDirectAccess() && (NULL != owner()->getHostMem())) {
bool hasUpdates = true;
// Make sure the parent of subbuffer is up to date
if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) {
// NOTE(review): gpuMemory is used below without a NULL check
gpu::Memory* gpuMemory = dev().getGpuMemory(owner()->parent());
//! \note: Skipping the sync for a view doesn't reflect the parent settings,
//! since a view is a small portion of parent
device::Memory::SyncFlags syncFlagsTmp;
// Sync parent from a view, so views have to be skipped
syncFlagsTmp.skipViews_ = true;
// Make sure the parent sync is an unique operation.
// If the app uses multiple subbuffers from multiple queues,
// then the parent sync can be called from multiple threads
amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp);
//! \note Don't do early exit here, since we still have to sync
//! this view, if the parent sync operation was a NOP.
//! If parent was synchronized, then this view sync will be a NOP
}
// Is this a NOP?
// (the version is already current, or this device was the last writer)
if ((version_ == owner()->getVersion()) ||
(&dev() == owner()->getLastWriter())) {
hasUpdates = false;
}
// Update all available views, since we sync the parent
if ((owner()->subBuffers().size() != 0) &&
(hasUpdates || !syncFlags.skipViews_)) {
std::list<amd::Memory*>::const_iterator it;
device::Memory::SyncFlags syncFlagsTmp;
// Sync views from parent, so parent has to be skipped
syncFlagsTmp.skipParent_ = true;
if (hasUpdates) {
// Parent will be synced so update all views with a skip
syncFlagsTmp.skipEntire_ = true;
}
else {
// Passthrough the skip entire flag to the views, since
// any view is a submemory of the parent
syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
}
amd::ScopedLock lock(owner()->lockMemoryOps());
for (it = owner()->subBuffers().begin();
it != owner()->subBuffers().end(); ++it) {
//! \note Don't allow subbuffer's allocation in the worker thread.
//! It may cause a system lock, because possible resource
//! destruction, heap reallocation or subbuffer allocation
static const bool AllocSubBuffer = false;
device::Memory* devSub =
(*it)->getDeviceMemory(dev(), AllocSubBuffer);
// Sub-buffers without device memory on this device are skipped
if (NULL != devSub) {
gpu::Memory* gpuSub = reinterpret_cast<gpu::Memory*>(devSub);
gpuSub->syncCacheFromHost(gpu, syncFlagsTmp);
}
}
}
// Make sure we didn't have a NOP,
// because this GPU device was the last writer
if (&dev() != owner()->getLastWriter()) {
// Update the latest version
version_ = owner()->getVersion();
}
// Exit if sync is a NOP or sync can be skipped
if (!hasUpdates || syncFlags.skipEntire_) {
return;
}
bool result = false;
static const bool Entire = true;
amd::Coord3D origin(0, 0, 0);
// If host memory was pinned then make a transfer
if (flags_ & PinnedMemoryAlloced) {
if (cal()->buffer_) {
amd::Coord3D region(owner()->getSize());
result = gpu.blitMgr().copyBuffer(*pinnedMemory_,
*this, origin, origin, region, Entire);
}
else {
amd::Image& image = *owner()->asImage();
result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_,
*this, origin, origin, image.getRegion(), Entire,
image.getRowPitch(), image.getSlicePitch());
}
}
// Fall back to a plain host write if there was no pinned transfer or it failed
if (!result) {
if (cal()->buffer_) {
amd::Coord3D region(owner()->getSize());
result = gpu.blitMgr().writeBuffer(owner()->getHostMem(),
*this, origin, region, Entire);
}
else {
amd::Image& image = *owner()->asImage();
result = gpu.blitMgr().writeImage(owner()->getHostMem(),
*this, origin, image.getRegion(),
image.getRowPitch(), image.getSlicePitch(), Entire);
}
}
//!@todo A wait isn't really necessary. However
//! Linux no-VM may have extra random failures.
wait(gpu);
// Should never fail
assert(result && "Memory synchronization failed!");
}
}
//! Brings the host copy of this memory object up to date with the device copy.
//! The sync is skipped entirely for objects with direct host access.
//! Otherwise the parent (for sub-objects) and all sub-buffers are
//! synchronized recursively, the local version is updated, and the device
//! data is transferred to the host - through the pinned memory if available,
//! with a plain read as the fallback.
//! \param syncFlags  skipParent_/skipViews_/skipEntire_ control which parts
//!                   of the parent/view hierarchy are processed
void
Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags)
{
// Sanity checks
assert(owner() != NULL);
// If host memory doesn't have direct access, then we have to synchronize
if (!isHostMemDirectAccess()) {
bool hasUpdates = true;
// Make sure the parent of subbuffer is up to date
if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) {
// NOTE(review): m is used below without a NULL check
device::Memory* m = owner()->parent()->getDeviceMemory(dev());
//! \note: Skipping the sync for a view doesn't reflect the parent settings,
//! since a view is a small portion of parent
device::Memory::SyncFlags syncFlagsTmp;
// Sync parent from a view, so views have to be skipped
syncFlagsTmp.skipViews_ = true;
// Make sure the parent sync is an unique operation.
// If the app uses multiple subbuffers from multiple queues,
// then the parent sync can be called from multiple threads
amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
m->syncHostFromCache(syncFlagsTmp);
//! \note Don't do early exit here, since we still have to sync
//! this view, if the parent sync operation was a NOP.
//! If parent was synchronized, then this view sync will be a NOP
}
// Is this a NOP?
// (there is no last writer, or the version is already current)
if ((NULL == owner()->getLastWriter()) ||
(version_ == owner()->getVersion())) {
hasUpdates = false;
}
// Update all available views, since we sync the parent
if ((owner()->subBuffers().size() != 0) &&
(hasUpdates || !syncFlags.skipViews_)) {
std::list<amd::Memory*>::const_iterator it;
device::Memory::SyncFlags syncFlagsTmp;
// Sync views from parent, so parent has to be skipped
syncFlagsTmp.skipParent_ = true;
if (hasUpdates) {
// Parent will be synced so update all views with a skip
syncFlagsTmp.skipEntire_ = true;
}
else {
// Passthrough the skip entire flag to the views, since
// any view is a submemory of the parent
syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
}
amd::ScopedLock lock(owner()->lockMemoryOps());
for (it = owner()->subBuffers().begin();
it != owner()->subBuffers().end(); ++it) {
//! \note Don't allow subbuffer's allocation in the worker thread.
//! It may cause a system lock, because possible resource
//! destruction, heap reallocation or subbuffer allocation
static const bool AllocSubBuffer = false;
device::Memory* devSub =
(*it)->getDeviceMemory(dev(), AllocSubBuffer);
// Sub-buffers without device memory on this device are skipped
if (NULL != devSub) {
gpu::Memory* gpuSub = reinterpret_cast<gpu::Memory*>(devSub);
gpuSub->syncHostFromCache(syncFlagsTmp);
}
}
}
// Make sure we didn't have a NOP,
// because CPU was the last writer
if (NULL != owner()->getLastWriter()) {
// Mark parent as up to date, set our version accordingly
version_ = owner()->getVersion();
}
// Exit if sync is a NOP or sync can be skipped
if (!hasUpdates || syncFlags.skipEntire_) {
return;
}
bool result = false;
static const bool Entire = true;
amd::Coord3D origin(0, 0, 0);
// If backing store was pinned then make a transfer
if (flags_ & PinnedMemoryAlloced) {
if (cal()->buffer_) {
amd::Coord3D region(owner()->getSize());
result = dev().xferMgr().copyBuffer(*this,
*pinnedMemory_, origin, origin, region, Entire);
}
else {
amd::Image& image = *owner()->asImage();
result = dev().xferMgr().copyImageToBuffer(*this,
*pinnedMemory_, origin, origin, image.getRegion(), Entire,
image.getRowPitch(), image.getSlicePitch());
}
}
// Just do a basic host read
// (also the fallback if the pinned transfer above failed)
if (!result) {
if (cal()->buffer_) {
amd::Coord3D region(owner()->getSize());
result = dev().xferMgr().readBuffer(*this,
owner()->getHostMem(), origin, region, Entire);
}
else {
amd::Image& image = *owner()->asImage();
result = dev().xferMgr().readImage(*this,
owner()->getHostMem(), origin, image.getRegion(),
image.getRowPitch(), image.getSlicePitch(), Entire);
}
}
// Should never fail
assert(result && "Memory synchronization failed!");
}
}
//! Creates a GPU memory view of this object for the sub-buffer \a subBufferOwner.
//! With a non-virtual heap a new heap block describing the view is attached
//! to this object's heap block. The sub-buffer owner's host memory pointer is
//! set relative to this owner's host memory.
//! \return The new view, or NULL on failure
gpu::Memory*
Memory::createBufferView(amd::Memory& subBufferOwner)
{
    Resource::ViewParams params;
    HeapBlock*  viewBlock = NULL;
    size_t      viewOffset = subBufferOwner.getOrigin();
    size_t      viewSize = subBufferOwner.getSize();

    // A non-virtual heap requires a heap block for the view
    if (!dev().heap()->isVirtual()) {
        if (hb_ == NULL) {
            LogError("HeapBlock must be initialized!");
            return NULL;
        }

        viewBlock = new HeapBlock(NULL, viewSize, viewOffset + hb_->offset());
        if (NULL == viewBlock) {
            LogError("We don't have enough video memory!");
            return NULL;
        }

        amd::ScopedLock lock(owner()->lockMemoryOps());
        hb_->addView(viewBlock);
    }

    // Allocate the memory object of the view
    gpu::Memory* view = new gpu::Memory(dev(), subBufferOwner, viewBlock, viewSize);
    if (view == NULL) {
        if (NULL != viewBlock) {
            viewBlock->setMemory(NULL);
            viewBlock->free();
        }
        return NULL;
    }

    // Describe the view relative to this object
    params.owner_       = &subBufferOwner;
    params.gpu_         = static_cast<VirtualGPU*>(subBufferOwner.getVirtualDevice());
    params.offset_      = viewOffset;
    params.size_        = viewSize;
    params.resource_    = this;
    params.memory_      = this;

    if (!view->create(Resource::View, &params)) {
        delete view;
        return NULL;
    }

    // Set the host memory location explicitly,
    // because the parent location could change after reallocation
    if (owner()->getHostMem() == NULL) {
        subBufferOwner.setHostMem(NULL);
    }
    else {
        subBufferOwner.setHostMem(
            reinterpret_cast<char*>(owner()->getHostMem()) + viewOffset);
    }

    return view;
}
void
Memory::decIndMapCount()
{
// Map/unmap must be serialized
amd::ScopedLock lock(owner()->lockMemoryOps());
if (indirectMapCount_ == 0) {
LogError("decIndMapCount() called when indirectMapCount_ already zero");
return;
}
// Decrement the counter and release indirect map if it's the last op
if (--indirectMapCount_ == 0) {
if (NULL != mapMemory_) {
amd::Memory* memory = mapMemory_;
amd::Memory* empty = NULL;
// Get GPU memory
Memory* gpuMemory = mapMemory();
gpuMemory->unmap(NULL);
if (!dev().addMapTarget(memory)) {
memory->release();
}
// Map/unamp is serialized for the same memory object,
// so it's safe to clear the pointer
assert((mapMemory_ != NULL) && "Mapped buffer should be valid");
mapMemory_ = NULL;
}
}
}
//! Returns a host pointer that can be used as the map target for this buffer.
//! The indirect map counter is incremented on entry; every failure path
//! (NULL return) decrements it again, so a failed call leaves the counter
//! unchanged.
//!
//! Three sources are tried in order:
//!  - the owner's host memory, if it exists and isDirectMap() holds,
//!  - the resource itself, if isPersistentDirectMap() holds,
//!  - an indirect map buffer (a cached map target, or a newly created
//!    system memory buffer; SVM owners use their SVM host pointer and are
//!    never cached).
//!
//! \note Must be called by the device under the async lock, so no spinning
//!       or long pauses allowed in this function.
//! \param origin      Only origin[0] is used, as the offset into the buffer
//! \param region      Only region[0] is used, for the range check
//! \param mapFlags    Not used by this implementation
//! \param rowPitch    Not used by this implementation
//! \param slicePitch  Not used by this implementation
//! \return Map address, or NULL on failure
void*
Memory::allocMapTarget(
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
    uint    mapFlags,
    size_t* rowPitch,
    size_t* slicePitch)
{
    // Sanity checks
    assert(owner() != NULL);

    // Map/unmap must be serialized
    amd::ScopedLock lock(owner()->lockMemoryOps());

    address mapAddress = NULL;
    size_t  offset = origin[0];

    incIndMapCount();

    // If host memory exists, use it
    if ((owner()->getHostMem() != NULL) && isDirectMap()) {
        mapAddress = reinterpret_cast<address>(owner()->getHostMem());
    }
    // If resource is a persistent allocation, we can use it directly
    else if (isPersistentDirectMap()) {
        if (NULL == map(NULL)) {
            LogError("Could not map target persistent resource");
            decIndMapCount();
            return NULL;
        }
        mapAddress = data();
    }
    // Otherwise we can use a remote resource:
    else {
        // Are we in range?
        size_t  elementCount = cal()->width_;
        size_t  rSize = elementCount * elementSize();
        if (offset >= rSize || offset + region[0] > rSize) {
            LogWarning("Memory::allocMapTarget() - offset/size out of bounds");
            // Undo the increment above, like all other failure paths
            decIndMapCount();
            return NULL;
        }

        // Allocate a map resource if there isn't any yet
        if (indirectMapCount_ == 1) {
            const static bool SysMem = true;
            bool failed = false;
            amd::Memory* memory = NULL;

            // Search for a possible indirect resource.
            // For SVM implementation, we cannot use cached map:
            // if svm space, use the svm host pointer
            void* initHostPtr = owner()->getSvmPtr();
            cl_mem_flags flag = 0;
            bool canBeCached = true;
            if (NULL != initHostPtr) {
                // Make sure the host memory is committed already,
                // or we have a big problem.
                assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!");
                flag = CL_MEM_USE_HOST_PTR;
                canBeCached = false;
            }
            else {
                memory = dev().findMapTarget(owner()->getSize());
            }

            if (memory == NULL) {
                // For map target of svm buffer, we need to use svm host ptr
                memory = new(owner()->getContext())
                    amd::Buffer(owner()->getContext(), flag, owner()->getSize());
                Memory* gpuMemory;

                do {
                    if ((memory == NULL) || !memory->create(initHostPtr, SysMem)) {
                        failed = true;
                        break;
                    }
                    memory->setCacheStatus(canBeCached);

                    gpuMemory = reinterpret_cast<Memory*>
                        (memory->getDeviceMemory(dev()));

                    // Create, Map and get the base pointer for the resource
                    if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) {
                        failed = true;
                        break;
                    }
                }
                while (false);
            }

            if (failed) {
                if (memory != NULL) {
                    memory->release();
                }
                decIndMapCount();
                LogError("Could not map target resource");
                return NULL;
            }

            // Map/unmap is serialized for the same memory object,
            // so it's safe to assign the new pointer
            assert((mapMemory_ == NULL) && "Mapped buffer can't be valid");
            mapMemory_ = memory;
        }
        else {
            // Did the map resource allocation fail?
            if (mapMemory_ == NULL) {
                // Undo the increment above, like all other failure paths
                decIndMapCount();
                LogError("Could not map target resource");
                return NULL;
            }
        }

        mapAddress = mapMemory()->data();
        // Use start of the indirect buffer
        offset = 0;
    }

    return mapAddress + offset;
}
//! Pins the host memory of this object, so it can be used for transfers.
//! Objects with direct host access don't need pinning and report success
//! right away. A previously pinned object is destroyed first. For a
//! sub-memory object the pinned memory is created as a view of the parent's
//! pinned memory (and fails if the parent has none); otherwise a new pinned
//! resource is created from the owner's host memory reference.
//! \param hostPtr  Not used by this implementation
//! \param size     Size of the pinned object in bytes
//! \return true if the memory has direct access or was pinned; false if the
//!         pinned object couldn't be created, or the required owner/parent
//!         isn't available
bool
Memory::pinSystemMemory(void* hostPtr, size_t size)
{
    bool result = false;

    // If memory has a direct access already, then skip the host memory pinning
    if (isHostMemDirectAccess()) {
        return true;
    }

    // Destroy the old pinned memory if it was already allocated
    if (flags_ & PinnedMemoryAlloced) {
        delete pinnedMemory_;
        pinnedMemory_ = NULL;
        flags_ &= ~PinnedMemoryAlloced;
    }

    // Allocate memory for the pinned object
    pinnedMemory_ = new Memory(dev(), size);
    if (pinnedMemory_ == NULL) {
        return false;
    }

    // Check if it's a view
    if (flags_ & SubMemoryObject) {
        const gpu::Memory* gpuMemory;
        if (owner() != NULL) {
            gpuMemory = dev().getGpuMemory(owner()->parent());
        }
        else {
            gpuMemory = parent();
        }

        // The view parameters come from the owner, so the view can only be
        // created when both the parent's GPU memory and the owner exist
        if ((gpuMemory != NULL) && (owner() != NULL) &&
            (gpuMemory->flags_ & PinnedMemoryAlloced)) {
            Resource::ViewParams params;
            params.owner_       = owner();
            params.offset_      = owner()->getOrigin();
            params.size_        = owner()->getSize();
            params.resource_    = gpuMemory->pinnedMemory_;
            params.memory_      = NULL;
            result = pinnedMemory_->create(Resource::View, &params);
        }
    }
    else if (owner() != NULL) {
        Resource::PinnedParams params;
        // Fill resource creation parameters
        params.owner_       = owner();
        params.hostMemRef_  = owner()->getHostMemRef();
        params.size_        = size;

        // Create resource
        result = pinnedMemory_->create(Resource::Pinned, &params);
    }

    if (!result) {
        delete pinnedMemory_;
        pinnedMemory_ = NULL;
        return false;
    }

    flags_ |= PinnedMemoryAlloced;
    return true;
}
//! Maps the resource for CPU access.
//! \param vDev        Virtual device; used as a VirtualGPU for the map
//! \param flags       Memory::CpuReadOnly or Memory::CpuWriteOnly select the
//!                    matching resource flag; anything else maps without flags
//! \param startLayer  Forwarded to map()
//! \param numLayers   Forwarded to map()
//! \param rowPitch    For non-buffer resources receives the row pitch in
//!                    bytes; may be NULL
//! \param slicePitch  For non-buffer resources receives the slice pitch in
//!                    bytes; may be NULL
//! \return The pointer returned by map()
void*
Memory::cpuMap(
    device::VirtualDevice& vDev, uint flags,
    uint startLayer, uint numLayers,
    size_t* rowPitch,
    size_t* slicePitch)
{
    uint resFlags = 0;
    if (flags == Memory::CpuReadOnly) {
        resFlags = Resource::ReadOnly;
    }
    else if (flags == Memory::CpuWriteOnly) {
        resFlags = Resource::WriteOnly;
    }

    void* ptr = map(&static_cast<VirtualGPU&>(vDev), resFlags, startLayer, numLayers);

    // Pitches are reported for images only; skip outputs the caller left out
    if (!cal()->buffer_) {
        if (rowPitch != NULL) {
            *rowPitch = cal()->pitch_ * elementSize();
        }
        if (slicePitch != NULL) {
            *slicePitch = cal()->slice_ * elementSize();
        }
    }
    return ptr;
}
//! Unmaps the resource previously mapped for CPU access.
//! \param vDev  Virtual device; used as a VirtualGPU for the unmap
void
Memory::cpuUnmap(device::VirtualDevice& vDev)
{
    VirtualGPU& gpu = static_cast<VirtualGPU&>(vDev);
    unmap(&gpu);
}
//! Copies the content of this object into \a dst and, on success, hands over
//! the map target, flags (except HostMemoryDirectAccess), indirect map count
//! and pinned memory, then replaces the owner's device memory with \a dst.
//! \note moveTo() must be called only from outside of
//!       VirtualGPU submit command methods.
//!       Otherwise a deadlock in lockVgpus() is possible.
//!       Also the logic in this function is very specific to
//!       the zero-copy functionality.
//! \return true if the data was copied and the object replaced
bool
Memory::moveTo(Memory& dst)
{
    // Keep all virtual devices from processing any commands
    Device::ScopedLockVgpus lock(dev());

    // Wait for idle on all virtual GPUs
    //!@note It's enough to wait on the active queue only
    for (uint i = 0; i < dev().vgpus().size(); ++i) {
        wait(*(dev().vgpus()[i]));
    }

    static const bool Entire = true;
    amd::Coord3D origin(0, 0, 0);
    amd::Coord3D region(size());

    // Transfer the data from the old location to the new one
    if (!dev().xferMgr().copyBuffer(
            *this, dst, origin, origin, region, Entire)) {
        return false;
    }

    // Hand over all properties to the new object
    dst.flags_ |= flags_ & ~HostMemoryDirectAccess;
    flags_ &= HostMemoryDirectAccess;

    dst.mapMemory_ = mapMemory_;
    mapMemory_ = NULL;

    dst.pinnedMemory_ = pinnedMemory_;
    pinnedMemory_ = NULL;

    dst.indirectMapCount_ = indirectMapCount_;
    indirectMapCount_ = 0;

    // Replace the device memory object
    //! @note: current object will be destroyed
    owner()->replaceDeviceMemory(&dev(), &dst);

    return true;
}
//! Returns the GPU memory object of the indirect map target.
//! \return The device memory of mapMemory_ on this device,
//!         or NULL if there is no map target
Memory*
Memory::mapMemory() const
{
    if (mapMemory_ == NULL) {
        return NULL;
    }
    return reinterpret_cast<Memory*>(mapMemory_->getDeviceMemory(dev()));
}
//! Writes the device cache back through the owner's host memory.
//! If the owner has no host memory yet, a staging buffer is allocated and
//! pinning is attempted. The write back itself is skipped if there is
//! still no host memory afterwards.
void
Memory::mgpuCacheWriteBack()
{
    // Only one write back at a time for this memory object
    amd::ScopedLock lock(owner()->lockMemoryOps());

    // Try to get a staging buffer if there is none yet
    if (NULL == owner()->getHostMem()) {
        static const bool forceAllocHostMem = true;
        if (owner()->allocHostMemory(NULL, forceAllocHostMem)) {
            //! \note The pinning result is ignored
            pinSystemMemory(
                owner()->getHostMem(), owner()->getHostMemRef()->size());
        }
    }

    // Synchronize, if host memory is available
    if (NULL != owner()->getHostMem()) {
        owner()->cacheWriteBack();
    }
}
//! Creates a GPU buffer object that is a view of this buffer for the
//! sub-buffer \a subBufferOwner, using the sub-buffer's origin and size.
//! \return The new view, or NULL on failure
Memory*
Buffer::createBufferView(amd::Memory& subBufferOwner) const
{
    Resource::ViewParams params;
    size_t  viewOffset = subBufferOwner.getOrigin();
    size_t  viewSize = subBufferOwner.getSize();

    // Allocate the memory object of the view
    gpu::Memory* view = new gpu::Buffer(dev(), subBufferOwner, viewSize);
    if (view == NULL) {
        return NULL;
    }

    // Describe the view relative to this buffer
    params.owner_       = &subBufferOwner;
    params.offset_      = viewOffset;
    params.size_        = viewSize;
    params.resource_    = this;
    params.memory_      = this;

    if (view->create(Resource::View, &params)) {
        return view;
    }

    delete view;
    return NULL;
}
//! Returns a host pointer that can be used as the map target for this image.
//! The indirect map counter is incremented on entry; every failure path
//! (NULL return) decrements it again, so a failed call leaves the counter
//! unchanged.
//!
//! Three sources are tried in order:
//!  - the owner's host memory, if it exists and isDirectMap() holds,
//!  - the resource itself, if isPersistentDirectMap() holds and it can be
//!    mapped,
//!  - an indirect 1D buffer (a cached map target, or a newly created system
//!    memory buffer).
//!
//! \param origin      Map origin in elements (X) and rows/slices (Y/Z)
//! \param region      Map region; defines the pitches of the indirect buffer
//! \param mapFlags    Not used by this implementation
//! \param rowPitch    Receives the row pitch in bytes on success; may be NULL
//! \param slicePitch  Receives the slice pitch in bytes on success; may be NULL
//! \return Map address including the origin offset, or NULL on failure
void*
Image::allocMapTarget(
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
    uint    mapFlags,
    size_t* rowPitch,
    size_t* slicePitch)
{
    // Sanity checks
    assert(owner() != NULL);

    bool    useRemoteResource = true;
    // Pitches are collected locally and reported at the end, so a NULL
    // output pointer is never dereferenced
    size_t  rowPitchTmp = 0;
    size_t  slicePitchTmp = 0;
    size_t  height = cal()->height_;
    size_t  depth = cal()->depth_;

    // Map/unmap must be serialized
    amd::ScopedLock lock(owner()->lockMemoryOps());

    address mapAddress = NULL;
    size_t  offset = origin[0];

    incIndMapCount();

    // If host memory exists, use it
    if ((owner()->getHostMem() != NULL) && isDirectMap()) {
        useRemoteResource = false;
        mapAddress = reinterpret_cast<address>(owner()->getHostMem());
        amd::Image* amdImage = owner()->asImage();

        // Calculate the offset in bytes
        offset *= elementSize();

        // Update the row and slice pitches value;
        // a zero pitch in the image means tightly packed
        rowPitchTmp = (amdImage->getRowPitch() == 0) ?
            (cal()->width_ * elementSize()) : amdImage->getRowPitch();
        slicePitchTmp = (amdImage->getSlicePitch() == 0) ?
            (height * rowPitchTmp) : amdImage->getSlicePitch();

        // Adjust the offset in Y and Z dimensions
        offset += origin[1] * rowPitchTmp;
        offset += origin[2] * slicePitchTmp;
    }
    // If resource is a persistent allocation, we can use it directly
    //! @note Even if resource is a persistent allocation,
    //! runtime can't use it directly,
    //! because CAL volume map doesn't work properly.
    //! @todo arrays can be added for persistent lock with some CAL changes
    else if (isPersistentDirectMap()) {
        if (NULL == map(NULL)) {
            useRemoteResource = true;
            LogError("Could not map target persistent resource, try remote resource");
        }
        else {
            useRemoteResource = false;
            mapAddress = data();

            // Calculate the offset in bytes
            offset *= elementSize();

            // Update the row pitch value
            //! \note The slice pitch stays 0 on this path
            rowPitchTmp = cal()->pitch_ * elementSize();

            // Adjust the offset in Y dimension
            offset += origin[1] * rowPitchTmp;
        }
    }

    // Otherwise we can use a remote resource:
    if (useRemoteResource) {
        // Allocate a map resource if there isn't any yet
        if (indirectMapCount_ == 1) {
            const static bool SysMem = true;
            bool failed = false;
            amd::Memory* memory;

            // Search for a possible indirect resource
            memory = dev().findMapTarget(owner()->getSize());

            if (memory == NULL) {
                // Allocate a new buffer to use as the map target
                //! @note Allocate a 1D buffer, since CAL issues with 3D
                //! Also HW doesn't support untiled images
                memory = new (owner()->getContext())
                    amd::Buffer(owner()->getContext(), 0,
                        cal()->width_ * height * depth * elementSize());
                Memory* gpuMemory;

                do {
                    if ((memory == NULL) || !memory->create(NULL, SysMem)) {
                        failed = true;
                        break;
                    }

                    gpuMemory = reinterpret_cast<Memory*>
                        (memory->getDeviceMemory(dev()));

                    // Create, Map and get the base pointer for the resource
                    if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) {
                        failed = true;
                        break;
                    }
                }
                while (false);
            }

            if (failed) {
                if (memory != NULL) {
                    memory->release();
                }
                decIndMapCount();
                LogError("Could not map target resource");
                return NULL;
            }

            // Map/unmap is serialized for the same memory object,
            // so it's safe to assign the new pointer
            assert((mapMemory_ == NULL) && "Mapped buffer can't be valid");
            mapMemory_ = memory;
        }
        else {
            // Did the map resource allocation fail?
            if (mapMemory_ == NULL) {
                // Undo the increment above, like the other failure path
                decIndMapCount();
                LogError("Could not map target resource");
                return NULL;
            }
        }

        mapAddress = mapMemory()->data();

        // Update the row and slice pitches value;
        // the indirect buffer is tightly packed for the mapped region
        rowPitchTmp = region[0] * elementSize();
        if (cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) {
            slicePitchTmp = rowPitchTmp;
        }
        else {
            slicePitchTmp = rowPitchTmp * region[1];
        }

        // Use start of the indirect buffer
        offset = 0;
    }

    // Report the pitches to the caller
    if (rowPitch != NULL) {
        *rowPitch = rowPitchTmp;
    }
    if (slicePitch != NULL) {
        *slicePitch = slicePitchTmp;
    }

    return mapAddress + offset;
}
} // namespace gpu