Files
rocm-systems/rocclr/runtime/device/pal/palmemory.cpp
T
foreman c2ca879dec P4 to Git Change 1369302 by todli@todli-win-opencl-kv1 on 2017/02/04 00:49:06
SWDEV-107271 - Fix crash that happens when Resource::create() fails and "owner" gets cleared. This is only triggered by Conformance test Basic progvar_prog_scope_misc on Raven with 16GB memory (8GB is fine), but should be a general bug.
	http://ocltc.amd.com/reviews/r/12253

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#11 edit
2017-02-04 00:59:54 -05:00

1301 lines
41 KiB
C++

//! Implementation of GPU device memory management
#include "top.hpp"
#include "thread/thread.hpp"
#include "thread/monitor.hpp"
#include "device/device.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palblit.hpp"
#ifdef _WIN32
#include <d3d10_1.h>
#include "amdocl/cl_d3d9_amd.hpp"
#include "amdocl/cl_d3d10_amd.hpp"
#include "amdocl/cl_d3d11_amd.hpp"
#endif //_WIN32
#include "amdocl/cl_gl_amd.hpp"
#include <string>
#include <fstream>
#include <sstream>
#include <iostream>
namespace pal {
//! Constructs a buffer-style GPU memory object on behalf of a runtime owner.
//!
//! \param gpuDev  GPU device the resource belongs to
//! \param owner   Runtime memory object this device memory is attached to
//! \param size    Size forwarded to the Resource base class
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size)
    : device::Memory(owner), Resource(gpuDev, size) {
  init();

  // An owner that has a parent makes this object a sub-memory object
  const bool ownerHasParent = (nullptr != owner.parent());
  if (ownerHasParent) {
    flags_ |= SubMemoryObject;
  }
}
//! Constructs a buffer-style GPU memory object that has no runtime owner.
//!
//! \param gpuDev  GPU device the resource belongs to
//! \param size    Size forwarded to both device::Memory and Resource
Memory::Memory(const Device& gpuDev, size_t size)
    : device::Memory(size), Resource(gpuDev, size) {
  // Only the pal::Memory specific fields need a reset here
  init();
}
//! Constructs an image-style GPU memory object on behalf of a runtime owner.
//!
//! \param gpuDev     GPU device the resource belongs to
//! \param owner      Runtime memory object this device memory is attached to
//! \param width      Image width forwarded to Resource
//! \param height     Image height forwarded to Resource
//! \param depth      Image depth forwarded to Resource
//! \param format     OpenCL image format forwarded to Resource
//! \param imageType  OpenCL memory object type forwarded to Resource
//! \param mipLevels  Number of mip levels forwarded to Resource
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t height,
               size_t depth, cl_image_format format, cl_mem_object_type imageType,
               uint mipLevels)
    : device::Memory(owner),
      Resource(gpuDev, width, height, depth, format, imageType, mipLevels) {
  init();

  // An owner that has a parent makes this object a sub-memory object
  const bool ownerHasParent = (nullptr != owner.parent());
  if (ownerHasParent) {
    flags_ |= SubMemoryObject;
  }
}
//! Constructs an image-style GPU memory object that has no runtime owner.
//!
//! \param gpuDev     GPU device the resource belongs to
//! \param size       Size forwarded to device::Memory
//! \param width      Image width forwarded to Resource
//! \param height     Image height forwarded to Resource
//! \param depth      Image depth forwarded to Resource
//! \param format     OpenCL image format forwarded to Resource
//! \param imageType  OpenCL memory object type forwarded to Resource
//! \param mipLevels  Number of mip levels forwarded to Resource
Memory::Memory(const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth,
               cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
    : device::Memory(size),
      Resource(gpuDev, width, height, depth, format, imageType, mipLevels) {
  // Only the pal::Memory specific fields need a reset here
  init();
}
//! Resets the pal::Memory specific state shared by all constructors.
void Memory::init() {
  // No parent, pinned or interop memory is attached yet
  parent_ = nullptr;
  pinnedMemory_ = nullptr;
  interopMemory_ = nullptr;
  interopType_ = InteropNone;

  // No indirect map is outstanding
  indirectMapCount_ = 0;
}
#ifdef _WIN32
//! Returns the DXGI shared handle of a D3D resource, or a null handle on failure.
//!
//! \param pIface  COM interface of the D3D resource; must not be null
//! \return The handle reported by IDXGIResource::GetSharedHandle(), or 0 if the
//!         interface query or the handle query does not return S_OK
static HANDLE getSharedHandle(IUnknown* pIface) {
  // Sanity checks
  assert(pIface != nullptr);

  // Ask the resource for its DXGI view
  IDXGIResource* dxgiResource = nullptr;
  const HRESULT queryStatus =
      pIface->QueryInterface(__uuidof(IDXGIResource), reinterpret_cast<void**>(&dxgiResource));
  if ((queryStatus != S_OK) || (dxgiResource == nullptr)) {
    return nullptr;
  }

  // Fetch the handle and drop the reference taken by QueryInterface()
  HANDLE sharedHandle = nullptr;
  const HRESULT handleStatus = dxgiResource->GetSharedHandle(&sharedHandle);
  dxgiResource->Release();

  return (handleStatus == S_OK) ? sharedHandle : nullptr;
}
#endif //_WIN32
//! Allocates the underlying resource, retrying with other memory types on failure.
//!
//! \param memType  Memory type requested for the first allocation attempt
//! \param params   Creation parameters forwarded to Resource::create(). After a
//!                 successful allocation of a View, ImageView or ImageBuffer they
//!                 are reinterpreted as the matching *Params structure.
//! \return true if one of the Resource::create() attempts succeeded
//!
//! On success the HostMemoryDirectAccess and SubMemoryObject flags and parent_
//! are updated according to the memory type reported by memoryType().
bool
Memory::create(
Resource::MemoryType memType,
Resource::CreateParams* params)
{
bool result;
uint allocAttempt = 0;
// Reset the flag in case we reallocate the heap in local/remote
flags_ &= ~HostMemoryDirectAccess;
do {
// Create a resource in CAL
result = Resource::create(memType, params);
if (!result) {
// The fallback decisions below are keyed on memoryType(), i.e. the type
// recorded on the resource, not on the local memType variable
size_t freeMemory[2];
// if requested memory is greater than available then exit the loop
dev().globalFreeMemory(freeMemory);
// Local to Persistent
if (memoryType() == Local) {
// For dgpu freeMemory[0] reports a sum of visible+invisible fb
if (size() > (freeMemory[0] * Ki)) {
break;
}
memType = Persistent;
}
// Don't switch to USWC if persistent memory was explicitly asked
else if ((allocAttempt > 0) && (memoryType() == Persistent)) {
memType = RemoteUSWC;
}
// Remote cacheable to uncacheable
else if (memoryType() == Remote) {
memType = RemoteUSWC;
}
else if (dev().settings().apuSystem_ && memoryType() == RemoteUSWC) {
// NOTE(review): memType is not changed in this branch, so when the size
// check passes the next iteration repeats the same RemoteUSWC request;
// confirm that the retry is expected to make progress
if (size() > (freeMemory[0] * Ki)) {
break;
}
}
else {
// No further fallback is available for this memory type
break;
}
allocAttempt++;
}
} while (!result);
// Check if CAL created a resource
if (result) {
switch (memoryType()) {
case Resource::Pinned:
// Marks memory object for direct GPU access to the host memory
flags_ |= HostMemoryDirectAccess;
break;
case Resource::Remote:
case Resource::RemoteUSWC:
if (!desc().tiled_) {
// Marks memory object for direct GPU access to the host memory
flags_ |= HostMemoryDirectAccess;
}
break;
case Resource::View: {
Resource::ViewParams* view =
reinterpret_cast<Resource::ViewParams*>(params);
// Check if parent was allocated in system memory
if ((view->resource_->memoryType() == Resource::Pinned) ||
(view->resource_->memoryType() == Resource::Remote) ||
(view->resource_->memoryType() == Resource::RemoteUSWC)) {
// Marks memory object for direct GPU access to the host memory
flags_ |= HostMemoryDirectAccess;
}
// Only a view whose owner has a parent is linked to its parent memory
if ((view->owner_ != nullptr) && (view->owner_->parent() != nullptr)) {
parent_ = reinterpret_cast<const Memory*>(view->memory_);
flags_ |= SubMemoryObject;
}
break;
}
case Resource::ImageView: {
Resource::ImageViewParams* view =
reinterpret_cast<Resource::ImageViewParams*>(params);
// The view inherits the direct host access flag from its parent.
// NOTE(review): view->memory_ is dereferenced without a null check
parent_ = reinterpret_cast<const Memory*>(view->memory_);
flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess);
break;
}
case Resource::ImageBuffer: {
Resource::ImageBufferParams* view =
reinterpret_cast<Resource::ImageBufferParams*>(params);
// The image inherits the direct host access flag from its parent buffer.
// NOTE(review): view->memory_ is dereferenced without a null check
parent_ = reinterpret_cast<const Memory*>(view->memory_);
flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess);
break;
}
default:
break;
}
}
return result;
}
//! Dispatches an OpenGL resource operation to the matching acquire/release call.
//!
//! \param operation  GLDecompressResource maps to glAcquire(),
//!                   GLInvalidateFBO maps to glRelease()
//! \return The result of the dispatched call; false for any other operation
//!         (which also trips an assert)
bool Memory::processGLResource(GLResourceOP operation) {
  if (operation == GLDecompressResource) {
    return glAcquire();
  }
  if (operation == GLInvalidateFBO) {
    return glRelease();
  }

  // Any other value is a programming error
  assert(false && "unknown GLResourceOP");
  return false;
}
//! Creates the GPU resource for an interop (D3D9/D3D10/D3D11/OpenGL) object.
//!
//! The interop object is taken from owner()->getInteropObj(). Depending on
//! which API object it exposes, the matching creation parameters are filled
//! in and the resource is created either directly on this object
//! (InteropDirectAccess) or on a separately allocated interopMemory_ object.
//!
//! \param type  Interop type; stored with setInteropType() on success
//! \return true on success; false if the object kind is unsupported or the
//!         resource creation fails
bool
Memory::createInterop(InteropType type)
{
Resource::MemoryType memType = Resource::Empty;
Resource::OGLInteropParams oglRes;
#ifdef _WIN32
Resource::D3DInteropParams d3dRes;
#endif //_WIN32
// Only external objects support interop
assert(owner() != nullptr);
Resource::CreateParams* createParams = nullptr;
amd::InteropObject* interop = owner()->getInteropObj();
assert((interop != nullptr) && "An invalid interop object is impossible!");
amd::GLObject* glObject = interop->asGLObject();
#ifdef _WIN32
amd::D3D10Object* d3d10Object = interop->asD3D10Object();
amd::D3D11Object* d3d11Object = interop->asD3D11Object();
amd::D3D9Object* d3d9Object = interop->asD3D9Object();
// The D3D variants are checked first; OpenGL is the final alternative
if (d3d10Object != nullptr) {
createParams = &d3dRes;
d3dRes.owner_ = owner();
const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc();
memType = Resource::D3D10Interop;
// Get shared handle
// NOTE(review): a null handle is not treated as an error here; the code
// continues to the resource creation below - confirm this is intended
if ((d3dRes.handle_ =
getSharedHandle(d3d10Object->getD3D10Resource()))) {
d3dRes.iDirect3D_ = static_cast<void*>
(d3d10Object->getD3D10Resource());
d3dRes.type_ = Resource::InteropTypeless;
}
d3dRes.misc = 0;
// Find D3D10 object type
switch (objDesc->objDim_) {
case D3D10_RESOURCE_DIMENSION_BUFFER:
d3dRes.type_ = Resource::InteropVertexBuffer;
break;
case D3D10_RESOURCE_DIMENSION_TEXTURE1D:
case D3D10_RESOURCE_DIMENSION_TEXTURE2D:
case D3D10_RESOURCE_DIMENSION_TEXTURE3D:
d3dRes.type_ = Resource::InteropTexture;
// layer_ and mipLevel_ are only assigned for mip-mapped textures
if (objDesc->mipLevels_ > 1) {
d3dRes.type_ = Resource::InteropTextureViewLevel;
if (objDesc->arraySize_ > 1) {
// The subresource index is split into layer and mip level
d3dRes.layer_ = d3d10Object->getSubresource() /
objDesc->mipLevels_;
d3dRes.mipLevel_ = d3d10Object->getSubresource() %
objDesc->mipLevels_;
}
else {
d3dRes.layer_ = 0;
d3dRes.mipLevel_ = d3d10Object->getSubresource();
}
}
break;
default:
return false;
break;
}
}
else if (d3d11Object != nullptr) {
createParams = &d3dRes;
d3dRes.owner_ = owner();
const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc();
memType = Resource::D3D11Interop;
// Get shared handle
// NOTE(review): a null handle is not treated as an error here either
if ((d3dRes.handle_ =
getSharedHandle(d3d11Object->getD3D11Resource()))) {
d3dRes.iDirect3D_ = static_cast<void*>
(d3d11Object->getD3D11Resource());
d3dRes.type_ = Resource::InteropTypeless;
}
d3dRes.misc = 0;
// Find D3D11 object type
switch (objDesc->objDim_) {
case D3D11_RESOURCE_DIMENSION_BUFFER:
d3dRes.type_ = Resource::InteropVertexBuffer;
break;
case D3D11_RESOURCE_DIMENSION_TEXTURE1D:
case D3D11_RESOURCE_DIMENSION_TEXTURE2D:
case D3D11_RESOURCE_DIMENSION_TEXTURE3D:
d3dRes.type_ = Resource::InteropTexture;
// Unlike the D3D10 path, plane and misc flag are taken from the object
d3dRes.layer_= d3d11Object->getPlane();
d3dRes.misc = d3d11Object->getMiscFlag();
if (objDesc->mipLevels_ > 1) {
d3dRes.type_ = Resource::InteropTextureViewLevel;
if (objDesc->arraySize_ > 1) {
// The subresource index is split into layer and mip level
d3dRes.layer_ = d3d11Object->getSubresource() /
objDesc->mipLevels_;
d3dRes.mipLevel_ = d3d11Object->getSubresource() %
objDesc->mipLevels_;
}
else {
d3dRes.layer_ = 0;
d3dRes.mipLevel_ = d3d11Object->getSubresource();
}
}
break;
default:
return false;
break;
}
}
else if (d3d9Object != nullptr) {
createParams = &d3dRes;
d3dRes.owner_ = owner();
const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc();
memType = Resource::D3D9Interop;
// Get shared handle
// The remaining D3D9 parameters are only filled in for a non-null handle
if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) {
d3dRes.iDirect3D_ = static_cast<void*>
(d3d9Object->getD3D9Resource());
d3dRes.type_ = Resource::InteropSurface;
d3dRes.mipLevel_ = 0;
d3dRes.layer_ = d3d9Object->getPlane();
d3dRes.misc = d3d9Object->getMiscFlag();
}
}
else
#endif //_WIN32
// On Windows this 'if' is the tail of the else-chain above
if (glObject != nullptr) {
createParams = &oglRes;
oglRes.owner_ = owner();
memType = Resource::OGLInterop;
// Fill the interop creation parameters
oglRes.handle_ = static_cast<uint>(glObject->getGLName());
// Find OGL object type
switch (glObject->getCLGLObjectType()) {
case CL_GL_OBJECT_BUFFER:
oglRes.type_ = Resource::InteropVertexBuffer;
break;
case CL_GL_OBJECT_TEXTURE_BUFFER:
case CL_GL_OBJECT_TEXTURE1D:
case CL_GL_OBJECT_TEXTURE1D_ARRAY:
case CL_GL_OBJECT_TEXTURE2D:
case CL_GL_OBJECT_TEXTURE2D_ARRAY:
case CL_GL_OBJECT_TEXTURE3D:
oglRes.type_ = Resource::InteropTexture;
if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) {
switch (glObject->getCubemapFace()) {
case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
oglRes.type_ = Resource::InteropTextureViewCube;
// The layer is the face index relative to POSITIVE_X
oglRes.layer_ =
glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X;
oglRes.mipLevel_ = glObject->getGLMipLevel();
break;
default:
break;
}
}
else if (glObject->getGLMipLevel() != 0) {
oglRes.type_ = Resource::InteropTextureViewLevel;
oglRes.layer_ = 0;
oglRes.mipLevel_ = glObject->getGLMipLevel();
}
break;
case CL_GL_OBJECT_RENDERBUFFER:
oglRes.type_ = Resource::InteropRenderBuffer;
break;
default:
return false;
break;
}
oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_;
oglRes.glDeviceContext_ = owner()->getContext().info().hDev_[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx];
// We dont pass any flags here for the GL Resource.
oglRes.flags_ = 0;
}
else {
// The interop object is none of the supported kinds
return false;
}
// Get the interop settings
if (type == InteropDirectAccess) {
// Create memory object
if (!create(memType, createParams)) {
return false;
}
}
else {
// Allocate Resource object for interop as buffer
interopMemory_ = new Memory(dev(), size());
// Create the interop object in CAL
if (nullptr == interopMemory_ || !interopMemory_->create(memType, createParams)) {
// Roll back the partially created interop object
delete interopMemory_;
interopMemory_ = nullptr;
return false;
}
}
setInteropType(type);
return true;
}
//! Releases everything attached to this memory object: the VA cache entry,
//! the interop memory, the map target, the pinned memory and, when applicable,
//! the direct host mapping.
Memory::~Memory() {
  // Clean VA cache
  dev().removeVACache(this);

  delete interopMemory_;

  // Release associated map target, if any
  if (mapMemory_ != nullptr) {
    mapMemory()->unmap(nullptr);
    mapMemory_->release();
  }

  // Destroy pinned memory
  if (flags_ & PinnedMemoryAlloced) {
    delete pinnedMemory_;
  }

  // The direct access mapping is only undone for owned, host-accessible,
  // non-view resources that are not external physical memory
  if (owner() == nullptr) {
    return;
  }
  if (!isHostMemDirectAccess()) {
    return;
  }
  if (flags_ & SubMemoryObject) {
    return;
  }
  if (memoryType() == Resource::ExternalPhysical) {
    return;
  }

  // Unmap memory if direct access was requested
  unmap(nullptr);
}
//! Brings this device allocation up to date with the owner's host memory.
//!
//! Nothing is transferred when the memory has direct host access or the owner
//! has no host memory. Otherwise the parent (for a sub-memory object) and all
//! sub-buffers of the owner are synchronized first, the cached version_ is
//! refreshed, and finally the data is copied from pinned memory or written
//! from the host pointer.
//!
//! \param gpu        Virtual GPU whose blit manager performs the transfer and
//!                   which is waited on afterwards
//! \param syncFlags  skipParent_ suppresses the parent sync, skipViews_ the
//!                   sub-buffer sync (unless there are updates), skipEntire_
//!                   the actual data transfer
void
Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
{
// If the last writer was another GPU, then make a writeback
if (!isHostMemDirectAccess() &&
(owner()->getLastWriter() != nullptr) &&
(&dev() != owner()->getLastWriter())) {
mgpuCacheWriteBack();
}
// If host memory doesn't have direct access, then we have to synchronize
if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) {
bool hasUpdates = true;
// Make sure the parent of subbuffer is up to date
if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) {
// NOTE(review): gpuMemory is used below without a null check
pal::Memory* gpuMemory = dev().getGpuMemory(owner()->parent());
//! \note: Skipping the sync for a view doesn't reflect the parent settings,
//! since a view is a small portion of parent
device::Memory::SyncFlags syncFlagsTmp;
// Sync parent from a view, so views have to be skipped
syncFlagsTmp.skipViews_ = true;
// Make sure the parent sync is an unique operation.
// If the app uses multiple subbuffers from multiple queues,
// then the parent sync can be called from multiple threads
amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp);
//! \note Don't do early exit here, since we still have to sync
//! this view, if the parent sync operation was a NOP.
//! If parent was synchronized, then this view sync will be a NOP
}
// Is this a NOP?
if ((version_ == owner()->getVersion()) ||
(&dev() == owner()->getLastWriter())) {
hasUpdates = false;
}
// Update all available views, since we sync the parent
if ((owner()->subBuffers().size() != 0) &&
(hasUpdates || !syncFlags.skipViews_)) {
device::Memory::SyncFlags syncFlagsTmp;
// Sync views from parent, so parent has to be skipped
syncFlagsTmp.skipParent_ = true;
if (hasUpdates) {
// Parent will be synced so update all views with a skip
syncFlagsTmp.skipEntire_ = true;
}
else {
// Passthrough the skip entire flag to the views, since
// any view is a submemory of the parent
syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
}
amd::ScopedLock lock(owner()->lockMemoryOps());
for (auto& sub : owner()->subBuffers()) {
//! \note Don't allow subbuffer's allocation in the worker thread.
//! It may cause a system lock, because possible resource
//! destruction, heap reallocation or subbuffer allocation
static const bool AllocSubBuffer = false;
device::Memory* devSub =
sub->getDeviceMemory(dev(), AllocSubBuffer);
// Sub-buffers without device memory on this device are skipped
if (nullptr != devSub) {
pal::Memory* gpuSub = reinterpret_cast<pal::Memory*>(devSub);
gpuSub->syncCacheFromHost(gpu, syncFlagsTmp);
}
}
}
// Make sure we didn't have a NOP,
// because this GPU device was the last writer
if (&dev() != owner()->getLastWriter()) {
// Update the latest version
version_ = owner()->getVersion();
}
// Exit if sync is a NOP or sync can be skipped
if (!hasUpdates || syncFlags.skipEntire_) {
return;
}
bool result = false;
static const bool Entire = true;
amd::Coord3D origin(0, 0, 0);
// If host memory was pinned then make a transfer
if (flags_ & PinnedMemoryAlloced) {
if (desc().buffer_) {
amd::Coord3D region(owner()->getSize());
result = gpu.blitMgr().copyBuffer(*pinnedMemory_,
*this, origin, origin, region, Entire);
}
else {
amd::Image& image = static_cast<amd::Image&>(*owner());
result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_,
*this, origin, origin, image.getRegion(), Entire,
image.getRowPitch(), image.getSlicePitch());
}
}
// Fall back to a plain host write if there is no pinned memory
// or the pinned copy failed
if (!result) {
if (desc().buffer_) {
amd::Coord3D region(owner()->getSize());
result = gpu.blitMgr().writeBuffer(owner()->getHostMem(),
*this, origin, region, Entire);
}
else {
amd::Image& image = static_cast<amd::Image&>(*owner());
result = gpu.blitMgr().writeImage(owner()->getHostMem(),
*this, origin, image.getRegion(),
image.getRowPitch(), image.getSlicePitch(), Entire);
}
}
//!@todo A wait isn't really necessary. However processMemObjects()
// may lose the track of dependencies with a compute transfer(if sdma failed).
wait(gpu);
// Should never fail
assert(result && "Memory synchronization failed!");
}
}
//! Brings the owner's host memory up to date with this device allocation.
//!
//! Nothing is done when the memory has direct host access. Otherwise the
//! parent (for a sub-memory object) and all sub-buffers of the owner are
//! synchronized first, the cached version_ is refreshed, and finally the
//! data is copied into pinned memory or read back into the host pointer
//! through the device transfer manager.
//!
//! \param syncFlags  skipParent_ suppresses the parent sync, skipViews_ the
//!                   sub-buffer sync (unless there are updates), skipEntire_
//!                   the actual data transfer
void
Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags)
{
// Sanity checks
assert(owner() != nullptr);
// If host memory doesn't have direct access, then we have to synchronize
if (!isHostMemDirectAccess()) {
bool hasUpdates = true;
// Make sure the parent of subbuffer is up to date
if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) {
// NOTE(review): m is used below without a null check
device::Memory* m = owner()->parent()->getDeviceMemory(dev());
//! \note: Skipping the sync for a view doesn't reflect the parent settings,
//! since a view is a small portion of parent
device::Memory::SyncFlags syncFlagsTmp;
// Sync parent from a view, so views have to be skipped
syncFlagsTmp.skipViews_ = true;
// Make sure the parent sync is an unique operation.
// If the app uses multiple subbuffers from multiple queues,
// then the parent sync can be called from multiple threads
amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
m->syncHostFromCache(syncFlagsTmp);
//! \note Don't do early exit here, since we still have to sync
//! this view, if the parent sync operation was a NOP.
//! If parent was synchronized, then this view sync will be a NOP
}
// Is this a NOP?
if ((nullptr == owner()->getLastWriter()) ||
(version_ == owner()->getVersion())) {
hasUpdates = false;
}
// Update all available views, since we sync the parent
if ((owner()->subBuffers().size() != 0) &&
(hasUpdates || !syncFlags.skipViews_)) {
device::Memory::SyncFlags syncFlagsTmp;
// Sync views from parent, so parent has to be skipped
syncFlagsTmp.skipParent_ = true;
if (hasUpdates) {
// Parent will be synced so update all views with a skip
syncFlagsTmp.skipEntire_ = true;
}
else {
// Passthrough the skip entire flag to the views, since
// any view is a submemory of the parent
syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
}
amd::ScopedLock lock(owner()->lockMemoryOps());
for (auto& sub : owner()->subBuffers()) {
//! \note Don't allow subbuffer's allocation in the worker thread.
//! It may cause a system lock, because possible resource
//! destruction, heap reallocation or subbuffer allocation
static const bool AllocSubBuffer = false;
device::Memory* devSub =
sub->getDeviceMemory(dev(), AllocSubBuffer);
// Sub-buffers without device memory on this device are skipped
if (nullptr != devSub) {
pal::Memory* gpuSub = reinterpret_cast<pal::Memory*>(devSub);
gpuSub->syncHostFromCache(syncFlagsTmp);
}
}
}
// Make sure we didn't have a NOP,
// because CPU was the last writer
if (nullptr != owner()->getLastWriter()) {
// Mark parent as up to date, set our version accordingly
version_ = owner()->getVersion();
}
// Exit if sync is a NOP or sync can be skipped
if (!hasUpdates || syncFlags.skipEntire_) {
return;
}
bool result = false;
static const bool Entire = true;
amd::Coord3D origin(0, 0, 0);
// If backing store was pinned then make a transfer
if (flags_ & PinnedMemoryAlloced) {
if (desc().buffer_) {
amd::Coord3D region(owner()->getSize());
result = dev().xferMgr().copyBuffer(*this,
*pinnedMemory_, origin, origin, region, Entire);
}
else {
amd::Image& image = static_cast<amd::Image&>(*owner());
result = dev().xferMgr().copyImageToBuffer(*this,
*pinnedMemory_, origin, origin, image.getRegion(), Entire,
image.getRowPitch(), image.getSlicePitch());
}
}
// Just do a basic host read
// (also used when the pinned copy above failed)
if (!result) {
if (desc().buffer_) {
amd::Coord3D region(owner()->getSize());
result = dev().xferMgr().readBuffer(*this,
owner()->getHostMem(), origin, region, Entire);
}
else {
amd::Image& image = static_cast<amd::Image&>(*owner());
result = dev().xferMgr().readImage(*this,
owner()->getHostMem(), origin, image.getRegion(),
image.getRowPitch(), image.getSlicePitch(), Entire);
}
}
// Should never fail
assert(result && "Memory synchronization failed!");
}
}
//! Creates a view of this memory object for a sub-buffer.
//!
//! \param subBufferOwner  Runtime sub-buffer object; its origin and size
//!                        define the window into this memory, and its host
//!                        pointer is updated to match
//! \return The new view, or nullptr if allocation or creation failed
pal::Memory* Memory::createBufferView(amd::Memory& subBufferOwner) {
  const size_t viewOffset = subBufferOwner.getOrigin();
  const size_t viewSize = subBufferOwner.getSize();

  // Create a memory object
  pal::Memory* view = new pal::Memory(dev(), subBufferOwner, viewSize);
  if (view == nullptr) {
    return nullptr;
  }

  // Describe the window into this resource
  Resource::ViewParams viewParams;
  viewParams.owner_ = &subBufferOwner;
  viewParams.gpu_ = static_cast<VirtualGPU*>(subBufferOwner.getVirtualDevice());
  viewParams.offset_ = viewOffset;
  viewParams.size_ = viewSize;
  viewParams.resource_ = this;
  viewParams.memory_ = this;

  if (!view->create(Resource::View, &viewParams)) {
    delete view;
    return nullptr;
  }

  // Explicitly set the host memory location,
  // because the parent location could change after reallocation
  if (owner()->getHostMem() == nullptr) {
    subBufferOwner.setHostMem(nullptr);
  } else {
    char* parentHost = reinterpret_cast<char*>(owner()->getHostMem());
    subBufferOwner.setHostMem(parentHost + viewOffset);
  }

  return view;
}
void
Memory::decIndMapCount()
{
// Map/unmap must be serialized
amd::ScopedLock lock(owner()->lockMemoryOps());
if (indirectMapCount_ == 0) {
if (!mipMapped()) {
LogError("decIndMapCount() called when indirectMapCount_ already zero");
}
return;
}
// Decrement the counter and release indirect map if it's the last op
if (--indirectMapCount_ == 0) {
if (nullptr != mapMemory_) {
amd::Memory* memory = mapMemory_;
amd::Memory* empty = nullptr;
// Get GPU memory
Memory* gpuMemory = mapMemory();
gpuMemory->unmap(nullptr);
if (!dev().addMapTarget(memory)) {
memory->release();
}
// Map/unamp is serialized for the same memory object,
// so it's safe to clear the pointer
assert((mapMemory_ != nullptr) && "Mapped buffer should be valid");
mapMemory_ = nullptr;
}
}
}
//! Provides the host address an application map of this buffer should use.
//!
//! The address comes from, in order of preference: the owner's host memory
//! (direct map), the resource itself (persistent direct map), or a separate
//! system memory buffer attached as the indirect map target (mapMemory_).
//! Every successful call leaves indirectMapCount_ incremented by one; every
//! failure after the increment restores the counter before returning.
//!
//! \note Must be called by the device under the async lock, so no spinning
//!       or long pauses allowed in this function.
//!
//! \param origin      origin[0] is the byte offset of the mapped region
//! \param region      region[0] is the byte size of the mapped region
//! \param mapFlags    Not used by this implementation
//! \param rowPitch    Not used by this implementation
//! \param slicePitch  Not used by this implementation
//! \return Host address of the mapped region, or nullptr on failure
void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region,
                             uint mapFlags, size_t* rowPitch, size_t* slicePitch) {
  // Sanity checks
  assert(owner() != nullptr);

  // Map/unmap must be serialized
  amd::ScopedLock lock(owner()->lockMemoryOps());

  address mapAddress = nullptr;
  size_t offset = origin[0];

  // For SVM implementation, we cannot use cached map.
  // If svm space, use the svm host pointer
  void* initHostPtr = owner()->getSvmPtr();
  if (nullptr != initHostPtr) {
    owner()->commitSvmMemory();
  }

  if (owner()->numDevices() > 1) {
    if ((nullptr == initHostPtr) && (owner()->getHostMem() == nullptr)) {
      static const bool forceAllocHostMem = true;
      if (!owner()->allocHostMemory(nullptr, forceAllocHostMem)) {
        return nullptr;
      }
      //! \note Ignore pinning result
      static_cast<void>(pinSystemMemory(owner()->getHostMem(), owner()->getSize()));
    }
  }

  // From here on every failure path has to undo this increment
  incIndMapCount();

  // If host memory exists, use it
  if ((owner()->getHostMem() != nullptr) && isDirectMap()) {
    mapAddress = reinterpret_cast<address>(owner()->getHostMem());
  }
  // If resource is a persistent allocation, we can use it directly
  else if (isPersistentDirectMap()) {
    if (nullptr == map(nullptr)) {
      LogError("Could not map target persistent resource");
      decIndMapCount();
      return nullptr;
    }
    mapAddress = data();
  }
  // Otherwise we can use a remote resource:
  else {
    // Are we in range?
    size_t elementCount = desc().width_;
    size_t rSize = elementCount * elementSize();
    if (offset >= rSize || offset + region[0] > rSize) {
      LogWarning("Memory::allocMapTarget() - offset/size out of bounds");
      // The map didn't happen, so it must not stay counted
      decIndMapCount();
      return nullptr;
    }

    // Allocate a map resource if there isn't any yet
    if (indirectMapCount_ == 1) {
      const static bool SysMem = true;
      bool failed = false;
      amd::Memory* memory = nullptr;
      // Search for a possible indirect resource
      cl_mem_flags flag = 0;
      bool canBeCached = true;
      if (nullptr != initHostPtr) {
        // make sure the host memory is committed already, or we have a big problem.
        assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!");
        flag = CL_MEM_USE_HOST_PTR;
        canBeCached = false;
      }
      else {
        memory = dev().findMapTarget(owner()->getSize());
      }

      if (memory == nullptr) {
        // for map target of svm buffer , we need use svm host ptr
        memory = new (dev().context())
            amd::Buffer(dev().context(), flag, owner()->getSize());
        Memory* gpuMemory;
        do {
          if ((memory == nullptr) || !memory->create(initHostPtr, SysMem)) {
            failed = true;
            break;
          }
          memory->setCacheStatus(canBeCached);
          gpuMemory = reinterpret_cast<Memory*>(memory->getDeviceMemory(dev()));
          // Create, Map and get the base pointer for the resource
          if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
            failed = true;
            break;
          }
        } while (false);
      }

      if (failed) {
        if (memory != nullptr) {
          memory->release();
        }
        decIndMapCount();
        LogError("Could not map target resource");
        return nullptr;
      }

      // Map/unmap is serialized for the same memory object,
      // so it's safe to assign the new pointer
      assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid");
      mapMemory_ = memory;
    }
    else {
      // Did the map resource allocation fail?
      if (mapMemory_ == nullptr) {
        LogError("Could not map target resource");
        // The map didn't happen, so it must not stay counted
        decIndMapCount();
        return nullptr;
      }
    }
    mapAddress = mapMemory()->data();
  }

  return mapAddress + offset;
}
//! Pins the owner's host memory so the GPU can access it for transfers.
//!
//! Nothing is done when the memory already has direct host access or was
//! pinned before. For a sub-memory object the pinned memory is created as a
//! view into the parent's pinned memory (and only if the parent is pinned);
//! otherwise a new Pinned resource is created from the owner's host memory
//! reference.
//!
//! \param hostPtr  Not used by this implementation; the host memory is taken
//!                 from owner()->getHostMemRef()
//! \param size     Size of the pinned allocation
//! \return true if the memory is directly accessible or pinned on return,
//!         false if pinning wasn't possible
bool Memory::pinSystemMemory(void* hostPtr, size_t size) {
  // If memory has a direct access already, then skip the host memory pinning
  if (isHostMemDirectAccess()) {
    return true;
  }

  // Memory was pinned already
  if (flags_ & PinnedMemoryAlloced) {
    return true;
  }

  // Allocate memory for the pinned object
  pinnedMemory_ = new Memory(dev(), size);
  if (pinnedMemory_ == nullptr) {
    return false;
  }

  bool result = false;

  // Check if it's a view
  if (flags_ & SubMemoryObject) {
    const pal::Memory* gpuMemory;
    if (owner() != nullptr) {
      gpuMemory = dev().getGpuMemory(owner()->parent());
    }
    else {
      gpuMemory = parent();
    }

    // The view parameters come from the owner, so a view can only be pinned
    // when both the owner and a pinned parent are available. Anything else
    // is reported as a pinning failure instead of dereferencing a null pointer.
    if ((gpuMemory != nullptr) && (owner() != nullptr) &&
        (gpuMemory->flags_ & PinnedMemoryAlloced)) {
      Resource::ViewParams params;
      params.owner_ = owner();
      params.offset_ = owner()->getOrigin();
      params.size_ = owner()->getSize();
      params.resource_ = gpuMemory->pinnedMemory_;
      params.memory_ = nullptr;
      result = pinnedMemory_->create(Resource::View, &params);
    }
  }
  else if (owner() != nullptr) {
    Resource::PinnedParams params;
    // Fill resource creation parameters
    params.owner_ = owner();
    params.hostMemRef_ = owner()->getHostMemRef();
    params.size_ = size;
    // Create resource
    result = pinnedMemory_->create(Resource::Pinned, &params);
  }

  if (!result) {
    // Roll back the allocation, so a later call can try again
    delete pinnedMemory_;
    pinnedMemory_ = nullptr;
    return false;
  }

  flags_ |= PinnedMemoryAlloced;
  return true;
}
//! Maps the resource for CPU access and reports the image pitches.
//!
//! \param vDev        Virtual device performing the map; must be a VirtualGPU
//! \param flags       Memory::CpuReadOnly or Memory::CpuWriteOnly select the
//!                    matching resource flag; any other value maps with no flags
//! \param startLayer  First layer forwarded to map()
//! \param numLayers   Number of layers forwarded to map()
//! \param rowPitch    For non-buffer resources receives the row pitch in bytes;
//!                    may be nullptr if the caller doesn't need it
//! \param slicePitch  For non-buffer resources receives the slice pitch in
//!                    bytes; may be nullptr if the caller doesn't need it
//! \return The pointer returned by map()
void* Memory::cpuMap(device::VirtualDevice& vDev, uint flags, uint startLayer, uint numLayers,
                     size_t* rowPitch, size_t* slicePitch) {
  uint resFlags = 0;
  if (flags == Memory::CpuReadOnly) {
    resFlags = Resource::ReadOnly;
  }
  else if (flags == Memory::CpuWriteOnly) {
    resFlags = Resource::WriteOnly;
  }

  void* ptr = map(&static_cast<VirtualGPU&>(vDev), resFlags, startLayer, numLayers);

  // Pitches only exist for images; skip outputs the caller didn't provide
  if (!desc().buffer_) {
    if (rowPitch != nullptr) {
      *rowPitch = desc().pitch_ * elementSize();
    }
    if (slicePitch != nullptr) {
      *slicePitch = desc().slice_ * elementSize();
    }
  }

  return ptr;
}
//! Undoes a cpuMap() on the given virtual device.
//!
//! \param vDev  Virtual device performing the unmap; must be a VirtualGPU
void Memory::cpuUnmap(device::VirtualDevice& vDev) {
  VirtualGPU& gpu = static_cast<VirtualGPU&>(vDev);
  unmap(&gpu);
}
//! Moves the content and the properties of this memory object into \p dst
//! and makes \p dst the owner's device memory.
//!
//! \note moveTo() must be called only from outside of
//!       VirtualGPU submit command methods.
//!       Otherwise a deadlock in lockVgpus() is possible.
//!       Also the logic in this function is very specific to
//!       the zero-copy functionality.
//!
//! \param dst  Destination memory object
//! \return true if the data was copied and the device memory was replaced,
//!         false if the copy failed (nothing is moved in that case)
bool Memory::moveTo(Memory& dst) {
  // Make sure that all virtual devices don't process any commands
  Device::ScopedLockVgpus lock(dev());

  // Wait for idle on all virtual GPUs
  //!@note It's enough to wait on the active queue only
  for (uint vgpuIdx = 0; vgpuIdx < dev().vgpus().size(); ++vgpuIdx) {
    VirtualGPU* vgpu = dev().vgpus()[vgpuIdx];
    wait(*vgpu);
  }

  static const bool Entire = true;
  amd::Coord3D start(0, 0, 0);
  amd::Coord3D extent(size());

  // Transfer the data from old location to a new one
  const bool copied = dev().xferMgr().copyBuffer(*this, dst, start, start, extent, Entire);
  if (!copied) {
    return false;
  }

  // Move all properties to the new object

  // Map target
  dst.mapMemory_ = mapMemory_;
  mapMemory_ = nullptr;

  // Flags: everything except the direct access bit moves over,
  // only the direct access bit stays behind
  dst.flags_ |= flags_ & ~HostMemoryDirectAccess;
  flags_ &= HostMemoryDirectAccess;

  // Outstanding indirect maps
  dst.indirectMapCount_ = indirectMapCount_;
  indirectMapCount_ = 0;

  // Pinned memory
  dst.pinnedMemory_ = pinnedMemory_;
  pinnedMemory_ = nullptr;

  // Replace the device memory object
  //! @note: current object will be destroyed
  owner()->replaceDeviceMemory(&dev(), &dst);
  return true;
}
//! Returns the device memory of the indirect map target, if there is one.
//!
//! \return Device memory of mapMemory_ on this device, or nullptr when no
//!         map target is attached
Memory* Memory::mapMemory() const {
  if (mapMemory_ == nullptr) {
    return nullptr;
  }
  return reinterpret_cast<Memory*>(mapMemory_->getDeviceMemory(dev()));
}
//! Writes the owner's data back to host memory for multi-GPU sharing,
//! providing (and pinning) a host staging area first if necessary.
void Memory::mgpuCacheWriteBack() {
  // Lock memory object, so only one write back can occur
  amd::ScopedLock lock(owner()->lockMemoryOps());

  // Attempt to allocate a staging buffer if don't have any
  if (owner()->getHostMem() == nullptr) {
    if (owner()->getSvmPtr() == nullptr) {
      static const bool forceAllocHostMem = true;
      owner()->allocHostMemory(nullptr, forceAllocHostMem);
    }
    else {
      // SVM memory doubles as the host staging area
      owner()->commitSvmMemory();
      owner()->setHostMem(owner()->getSvmPtr());
    }
  }

  // Without host memory there is nothing to write back to
  if (owner()->getHostMem() == nullptr) {
    return;
  }

  // Make synchronization
  //! \note Ignore pinning result
  bool pinned = pinSystemMemory(owner()->getHostMem(), owner()->getSize());
  static_cast<void>(pinned);
  owner()->cacheWriteBack();
}
//! Creates a pal::Buffer view of this buffer for a sub-buffer.
//!
//! \param subBufferOwner  Runtime sub-buffer object; its origin and size
//!                        define the window into this buffer
//! \return The new view, or nullptr if allocation or creation failed
Memory* Buffer::createBufferView(amd::Memory& subBufferOwner) const {
  const size_t viewOffset = subBufferOwner.getOrigin();
  const size_t viewSize = subBufferOwner.getSize();

  // Create a memory object
  pal::Memory* view = new pal::Buffer(dev(), subBufferOwner, viewSize);
  if (view == nullptr) {
    return nullptr;
  }

  // Allocate a view for this buffer object
  Resource::ViewParams viewParams;
  viewParams.owner_ = &subBufferOwner;
  viewParams.offset_ = viewOffset;
  viewParams.size_ = viewSize;
  viewParams.resource_ = this;
  viewParams.memory_ = this;

  const bool created = view->create(Resource::View, &viewParams);
  if (!created) {
    delete view;
    return nullptr;
  }

  return view;
}
//! Provides the host address an application map of this image should use,
//! together with the row and slice pitches of that mapping.
//!
//! The address comes from, in order of preference: the owner's host memory
//! (direct map), the resource itself (persistent direct map), or a separate
//! linear system memory buffer attached as the indirect map target
//! (mapMemory_). If the persistent map fails, the indirect target is used.
//! Every successful call leaves indirectMapCount_ incremented by one; every
//! failure restores the counter before returning.
//!
//! \param origin      Origin of the mapped region in elements
//! \param region      Size of the mapped region in elements
//! \param mapFlags    Not used by this implementation
//! \param rowPitch    Receives the row pitch in bytes; may be nullptr
//! \param slicePitch  Receives the slice pitch in bytes (0 for a persistent
//!                    direct map); may be nullptr
//! \return Host address of the mapped region, or nullptr on failure
void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region,
                            uint mapFlags, size_t* rowPitch, size_t* slicePitch) {
  // Sanity checks
  assert(owner() != nullptr);

  bool useRemoteResource = true;
  size_t rowPitchTmp = 0;
  size_t slicePitchTmp = 0;
  size_t height = desc().height_;
  size_t depth = desc().depth_;

  // Map/unmap must be serialized
  amd::ScopedLock lock(owner()->lockMemoryOps());

  address mapAddress = nullptr;
  size_t offset = origin[0];

  // From here on every failure path has to undo this increment
  incIndMapCount();

  // If host memory exists, use it
  if ((owner()->getHostMem() != nullptr) && isDirectMap()) {
    useRemoteResource = false;
    mapAddress = reinterpret_cast<address>(owner()->getHostMem());
    amd::Image* amdImage = owner()->asImage();

    // Calculate the offset in bytes
    offset *= elementSize();

    // Update the row and slice pitches value
    rowPitchTmp = (amdImage->getRowPitch() == 0) ?
        (desc().width_ * elementSize()) : amdImage->getRowPitch();
    slicePitchTmp = (amdImage->getSlicePitch() == 0) ?
        (height * rowPitchTmp) : amdImage->getSlicePitch();

    // Adjust the offset in Y and Z dimensions
    offset += origin[1] * rowPitchTmp;
    offset += origin[2] * slicePitchTmp;
  }
  // If resource is a persistent allocation, we can use it directly
  //! @note Even if resource is a persistent allocation,
  //! runtime can't use it directly,
  //! because CAL volume map doesn't work properly.
  //! @todo arrays can be added for persistent lock with some CAL changes
  else if (isPersistentDirectMap()) {
    if (nullptr == map(nullptr)) {
      useRemoteResource = true;
      LogError("Could not map target persistent resource, try remote resource");
    }
    else {
      useRemoteResource = false;
      mapAddress = data();

      // Calculate the offset in bytes
      offset *= elementSize();

      // Update the row pitch value
      rowPitchTmp = desc().pitch_ * elementSize();

      // Adjust the offset in Y dimension
      offset += origin[1] * rowPitchTmp;
    }
  }

  // Otherwise we can use a remote resource:
  if (useRemoteResource) {
    // Calculate X offset in bytes
    offset *= elementSize();

    // Allocate a map resource if there isn't any yet
    if (indirectMapCount_ == 1) {
      const static bool SysMem = true;
      bool failed = false;
      amd::Memory* memory;

      // Search for a possible indirect resource
      memory = dev().findMapTarget(owner()->getSize());

      if (memory == nullptr) {
        // Allocate a new buffer to use as the map target
        //! @note Allocate a 1D buffer, since CAL issues with 3D
        //! Also HW doesn't support untiled images
        memory = new (dev().context())
            amd::Buffer(dev().context(), 0, desc().width_ * height * depth * elementSize());
        Memory* gpuMemory;
        do {
          // The allocation has to be checked before the object is touched
          if (memory == nullptr) {
            failed = true;
            break;
          }
          memory->setVirtualDevice(owner()->getVirtualDevice());
          if (!memory->create(nullptr, SysMem)) {
            failed = true;
            break;
          }
          gpuMemory = reinterpret_cast<Memory*>(memory->getDeviceMemory(dev()));
          // Create, Map and get the base pointer for the resource
          if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
            failed = true;
            break;
          }
        } while (false);
      }

      if (failed) {
        if (memory != nullptr) {
          memory->release();
        }
        decIndMapCount();
        LogError("Could not map target resource");
        return nullptr;
      }

      // Map/unmap is serialized for the same memory object,
      // so it's safe to assign the new pointer
      assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid");
      mapMemory_ = memory;
    }
    else {
      // Did the map resource allocation fail?
      if (mapMemory_ == nullptr) {
        LogError("Could not map target resource");
        // The map didn't happen, so it must not stay counted
        decIndMapCount();
        return nullptr;
      }
    }

    mapAddress = mapMemory()->data();

    // Update the row and slice pitches value:
    // the indirect buffer is tightly packed around the mapped region
    rowPitchTmp = region[0] * elementSize();
    if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
      slicePitchTmp = rowPitchTmp;
    }
    else {
      slicePitchTmp = rowPitchTmp * region[1];
    }

    // Use start of the indirect buffer
    offset = 0;
  }

  // Report the pitches only through the outputs the caller provided
  if (rowPitch != nullptr) {
    *rowPitch = rowPitchTmp;
  }
  if (slicePitch != nullptr) {
    *slicePitch = slicePitchTmp;
  }

  return mapAddress + offset;
}
} // namespace pal