a94fa4eabb
SWDEV-86035 - Add PAL backend to OpenCL - Fix a crash in the pipe test. Device layer can't use device blit queue directly, but requires a blit manager call, which will perform correct wait for idle sequence. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp#2 edit
1260 строки
40 KiB
C++
1260 строки
40 KiB
C++
//! Implementation of GPU device memory management
|
|
|
|
#include "top.hpp"
|
|
#include "thread/thread.hpp"
|
|
#include "thread/monitor.hpp"
|
|
#include "device/device.hpp"
|
|
#include "device/pal/paldevice.hpp"
|
|
#include "device/pal/palblit.hpp"
|
|
|
|
#ifdef _WIN32
|
|
#include <d3d10_1.h>
|
|
#include "amdocl/cl_d3d9_amd.hpp"
|
|
#include "amdocl/cl_d3d10_amd.hpp"
|
|
#include "amdocl/cl_d3d11_amd.hpp"
|
|
#endif //_WIN32
|
|
#include "amdocl/cl_gl_amd.hpp"
|
|
|
|
#include <string>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
|
|
namespace pal {
|
|
|
|
//! Constructs device memory attached to the amd::Memory object \p owner.
//! \p gpuDev and \p size are forwarded to the Resource base class.
Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size)
    : device::Memory(owner)
    , Resource(gpuDev, size)
{
    init();

    // An owner with a parent marks this object as a sub memory object
    const bool ownerHasParent = (nullptr != owner.parent());
    if (ownerHasParent) {
        flags_ |= SubMemoryObject;
    }
}
|
|
|
|
//! Constructs device memory that has no amd::Memory owner.
//! \p size is forwarded to both base classes.
Memory::Memory(const Device& gpuDev, size_t size)
    : device::Memory(size)
    , Resource(gpuDev, size)
{
    // Put the pal::Memory specific members into their initial state
    init();
}
|
|
|
|
//! Constructs device memory attached to the amd::Memory object \p owner.
//! The dimensions, format, image type and mip level count are forwarded
//! unchanged to the Resource base class.
Memory::Memory(const Device& gpuDev,
               amd::Memory& owner,
               size_t width,
               size_t height,
               size_t depth,
               cl_image_format format,
               cl_mem_object_type imageType,
               uint mipLevels)
    : device::Memory(owner)
    , Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
{
    init();

    // An owner with a parent marks this object as a sub memory object
    const bool ownerHasParent = (nullptr != owner.parent());
    if (ownerHasParent) {
        flags_ |= SubMemoryObject;
    }
}
|
|
|
|
//! Constructs device memory that has no amd::Memory owner.
//! \p size goes to device::Memory; the dimensions, format, image type and
//! mip level count go to the Resource base class.
Memory::Memory(const Device& gpuDev,
               size_t size,
               size_t width,
               size_t height,
               size_t depth,
               cl_image_format format,
               cl_mem_object_type imageType,
               uint mipLevels)
    : device::Memory(size)
    , Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
{
    // Put the pal::Memory specific members into their initial state
    init();
}
|
|
|
|
void
|
|
Memory::init()
|
|
{
|
|
indirectMapCount_ = 0;
|
|
interopType_ = InteropNone;
|
|
interopMemory_ = nullptr;
|
|
pinnedMemory_ = nullptr;
|
|
parent_ = nullptr;
|
|
}
|
|
|
|
#ifdef _WIN32
|
|
//! Returns the DXGI shared handle of the resource behind \p pIface.
//!
//! \param pIface  COM interface that is queried for IDXGIResource
//! \return the shared handle, or a null handle if \p pIface is null, the
//!         interface query fails or GetSharedHandle() does not return S_OK
static HANDLE
getSharedHandle(IUnknown* pIface)
{
    // Sanity checks
    assert(pIface != nullptr);
    if (pIface == nullptr) {
        // assert() is compiled out in release builds, so fail gracefully
        return nullptr;
    }

    IDXGIResource* pDxgiRes = nullptr;
    const HRESULT queryRes = pIface->QueryInterface(
        __uuidof(IDXGIResource), reinterpret_cast<void**>(&pDxgiRes));
    if ((queryRes != S_OK) || (pDxgiRes == nullptr)) {
        return nullptr;
    }

    // Start from a null handle so a failed query never leaks an
    // indeterminate value
    HANDLE hShared = nullptr;
    const HRESULT handleRes = pDxgiRes->GetSharedHandle(&hShared);

    // The queried interface holds a reference of its own; drop it
    pDxgiRes->Release();

    return (handleRes == S_OK) ? hShared : nullptr;
}
|
|
#endif //_WIN32
|
|
|
|
bool
|
|
Memory::create(
|
|
Resource::MemoryType memType,
|
|
Resource::CreateParams* params)
|
|
{
|
|
bool result;
|
|
|
|
// Reset the flag in case we reallocate the heap in local/remote
|
|
flags_ &= ~HostMemoryDirectAccess;
|
|
|
|
// Create a resource in CAL
|
|
result = Resource::create(memType, params);
|
|
|
|
// Check if CAL created a resource
|
|
if (result) {
|
|
switch (memoryType()) {
|
|
case Resource::Pinned:
|
|
case Resource::ExternalPhysical:
|
|
// Marks memory object for direct GPU access to the host memory
|
|
flags_ |= HostMemoryDirectAccess;
|
|
break;
|
|
case Resource::Remote:
|
|
case Resource::RemoteUSWC:
|
|
if (!desc().tiled_) {
|
|
// Marks memory object for direct GPU access to the host memory
|
|
flags_ |= HostMemoryDirectAccess;
|
|
}
|
|
break;
|
|
case Resource::View: {
|
|
Resource::ViewParams* view =
|
|
reinterpret_cast<Resource::ViewParams*>(params);
|
|
// Check if parent was allocated in system memory
|
|
if ((view->resource_->memoryType() == Resource::Pinned) ||
|
|
(view->resource_->memoryType() == Resource::Remote) ||
|
|
(view->resource_->memoryType() == Resource::RemoteUSWC)) {
|
|
// Marks memory object for direct GPU access to the host memory
|
|
flags_ |= HostMemoryDirectAccess;
|
|
}
|
|
if ((view->owner_ != nullptr) && (view->owner_->parent() != nullptr)) {
|
|
parent_ = reinterpret_cast<const Memory*>(view->memory_);
|
|
flags_ |= SubMemoryObject;
|
|
}
|
|
break;
|
|
}
|
|
case Resource::ImageView: {
|
|
Resource::ImageViewParams* view =
|
|
reinterpret_cast<Resource::ImageViewParams*>(params);
|
|
parent_ = reinterpret_cast<const Memory*>(view->memory_);
|
|
flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess);
|
|
break;
|
|
}
|
|
case Resource::ImageBuffer: {
|
|
Resource::ImageBufferParams* view =
|
|
reinterpret_cast<Resource::ImageBufferParams*>(params);
|
|
parent_ = reinterpret_cast<const Memory*>(view->memory_);
|
|
flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess);
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
//! Dispatches a GL resource operation to the matching acquire/release call.
//!
//! \param operation  GLDecompressResource calls gslGLAcquire(),
//!                   GLInvalidateFBO calls gslGLRelease()
//! \return the result of the dispatched call; false for an unknown operation
bool Memory::processGLResource(GLResourceOP operation)
{
    switch (operation) {
    case GLDecompressResource:
        return gslGLAcquire();
    case GLInvalidateFBO:
        return gslGLRelease();
    default:
        assert(false && "unknown GLResourceOP");
        return false;
    }
}
|
|
|
|
bool
|
|
Memory::createInterop(InteropType type)
|
|
{
|
|
Resource::MemoryType memType = Resource::Empty;
|
|
Resource::OGLInteropParams oglRes;
|
|
#ifdef _WIN32
|
|
Resource::D3DInteropParams d3dRes;
|
|
#endif //_WIN32
|
|
|
|
// Only external objects support interop
|
|
assert(owner() != nullptr);
|
|
|
|
Resource::CreateParams* createParams = nullptr;
|
|
|
|
amd::InteropObject* interop = owner()->getInteropObj();
|
|
assert((interop != nullptr) && "An invalid interop object is impossible!");
|
|
|
|
amd::GLObject* glObject = interop->asGLObject();
|
|
#ifdef _WIN32
|
|
amd::D3D10Object* d3d10Object = interop->asD3D10Object();
|
|
amd::D3D11Object* d3d11Object = interop->asD3D11Object();
|
|
amd::D3D9Object* d3d9Object = interop->asD3D9Object();
|
|
|
|
if (d3d10Object != nullptr) {
|
|
createParams = &d3dRes;
|
|
|
|
d3dRes.owner_ = owner();
|
|
|
|
const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc();
|
|
|
|
memType = Resource::D3D10Interop;
|
|
|
|
// Get shared handle
|
|
if ((d3dRes.handle_ =
|
|
getSharedHandle(d3d10Object->getD3D10Resource()))) {
|
|
d3dRes.iDirect3D_ = static_cast<void*>
|
|
(d3d10Object->getD3D10Resource());
|
|
d3dRes.type_ = Resource::InteropTypeless;
|
|
}
|
|
|
|
d3dRes.misc = 0;
|
|
// Find D3D10 object type
|
|
switch (objDesc->objDim_) {
|
|
case D3D10_RESOURCE_DIMENSION_BUFFER:
|
|
d3dRes.type_ = Resource::InteropVertexBuffer;
|
|
break;
|
|
case D3D10_RESOURCE_DIMENSION_TEXTURE1D:
|
|
case D3D10_RESOURCE_DIMENSION_TEXTURE2D:
|
|
case D3D10_RESOURCE_DIMENSION_TEXTURE3D:
|
|
d3dRes.type_ = Resource::InteropTexture;
|
|
if (objDesc->mipLevels_ > 1) {
|
|
d3dRes.type_ = Resource::InteropTextureViewLevel;
|
|
|
|
if (objDesc->arraySize_ > 1) {
|
|
d3dRes.layer_ = d3d10Object->getSubresource() /
|
|
objDesc->mipLevels_;
|
|
d3dRes.mipLevel_ = d3d10Object->getSubresource() %
|
|
objDesc->mipLevels_;
|
|
}
|
|
else {
|
|
d3dRes.layer_ = 0;
|
|
d3dRes.mipLevel_ = d3d10Object->getSubresource();
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
return false;
|
|
break;
|
|
}
|
|
}
|
|
else if (d3d11Object != nullptr) {
|
|
createParams = &d3dRes;
|
|
|
|
d3dRes.owner_ = owner();
|
|
|
|
const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc();
|
|
|
|
memType = Resource::D3D11Interop;
|
|
|
|
// Get shared handle
|
|
if ((d3dRes.handle_ =
|
|
getSharedHandle(d3d11Object->getD3D11Resource()))) {
|
|
d3dRes.iDirect3D_ = static_cast<void*>
|
|
(d3d11Object->getD3D11Resource());
|
|
d3dRes.type_ = Resource::InteropTypeless;
|
|
}
|
|
|
|
d3dRes.misc = 0;
|
|
// Find D3D11 object type
|
|
switch (objDesc->objDim_) {
|
|
case D3D11_RESOURCE_DIMENSION_BUFFER:
|
|
d3dRes.type_ = Resource::InteropVertexBuffer;
|
|
break;
|
|
case D3D11_RESOURCE_DIMENSION_TEXTURE1D:
|
|
case D3D11_RESOURCE_DIMENSION_TEXTURE2D:
|
|
case D3D11_RESOURCE_DIMENSION_TEXTURE3D:
|
|
d3dRes.type_ = Resource::InteropTexture;
|
|
d3dRes.layer_= d3d11Object->getPlane();
|
|
d3dRes.misc = d3d11Object->getMiscFlag();
|
|
if (objDesc->mipLevels_ > 1) {
|
|
d3dRes.type_ = Resource::InteropTextureViewLevel;
|
|
|
|
if (objDesc->arraySize_ > 1) {
|
|
d3dRes.layer_ = d3d11Object->getSubresource() /
|
|
objDesc->mipLevels_;
|
|
d3dRes.mipLevel_ = d3d11Object->getSubresource() %
|
|
objDesc->mipLevels_;
|
|
}
|
|
else {
|
|
d3dRes.layer_ = 0;
|
|
d3dRes.mipLevel_ = d3d11Object->getSubresource();
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
return false;
|
|
break;
|
|
}
|
|
}
|
|
else if (d3d9Object != nullptr) {
|
|
createParams = &d3dRes;
|
|
|
|
d3dRes.owner_ = owner();
|
|
|
|
const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc();
|
|
|
|
memType = Resource::D3D9Interop;
|
|
|
|
// Get shared handle
|
|
if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) {
|
|
d3dRes.iDirect3D_ = static_cast<void*>
|
|
(d3d9Object->getD3D9Resource());
|
|
d3dRes.type_ = Resource::InteropSurface;
|
|
d3dRes.mipLevel_ = 0;
|
|
d3dRes.layer_ = d3d9Object->getPlane();
|
|
d3dRes.misc = d3d9Object->getMiscFlag();
|
|
}
|
|
}
|
|
else
|
|
#endif //_WIN32
|
|
if (glObject != nullptr) {
|
|
createParams = &oglRes;
|
|
|
|
oglRes.owner_ = owner();
|
|
|
|
memType = Resource::OGLInterop;
|
|
|
|
// Fill the interop creation parameters
|
|
oglRes.handle_ = static_cast<uint>(glObject->getGLName());
|
|
|
|
// Find OGL object type
|
|
switch (glObject->getCLGLObjectType()) {
|
|
case CL_GL_OBJECT_BUFFER:
|
|
oglRes.type_ = Resource::InteropVertexBuffer;
|
|
break;
|
|
case CL_GL_OBJECT_TEXTURE_BUFFER:
|
|
case CL_GL_OBJECT_TEXTURE1D:
|
|
case CL_GL_OBJECT_TEXTURE1D_ARRAY:
|
|
case CL_GL_OBJECT_TEXTURE2D:
|
|
case CL_GL_OBJECT_TEXTURE2D_ARRAY:
|
|
case CL_GL_OBJECT_TEXTURE3D:
|
|
oglRes.type_ = Resource::InteropTexture;
|
|
if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) {
|
|
switch (glObject->getCubemapFace()) {
|
|
case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
|
|
case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
|
|
case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
|
|
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
|
|
case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
|
|
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
|
|
oglRes.type_ = Resource::InteropTextureViewCube;
|
|
oglRes.layer_ =
|
|
glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X;
|
|
oglRes.mipLevel_ = glObject->getGLMipLevel();
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else if (glObject->getGLMipLevel() != 0) {
|
|
oglRes.type_ = Resource::InteropTextureViewLevel;
|
|
oglRes.layer_ = 0;
|
|
oglRes.mipLevel_ = glObject->getGLMipLevel();
|
|
}
|
|
break;
|
|
case CL_GL_OBJECT_RENDERBUFFER:
|
|
oglRes.type_ = Resource::InteropRenderBuffer;
|
|
break;
|
|
default:
|
|
return false;
|
|
break;
|
|
}
|
|
|
|
oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_;
|
|
oglRes.glDeviceContext_ = owner()->getContext().info().hDev_[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx];
|
|
// We dont pass any flags here for the GL Resource.
|
|
oglRes.flags_ = 0;
|
|
}
|
|
else {
|
|
return false;
|
|
}
|
|
|
|
// Get the interop settings
|
|
if (type == InteropDirectAccess) {
|
|
// Create memory object
|
|
if (!create(memType, createParams)) {
|
|
return false;
|
|
}
|
|
}
|
|
else {
|
|
// Allocate Resource object for interop as buffer
|
|
interopMemory_ = new Memory(dev(), size());
|
|
|
|
// Create the interop object in CAL
|
|
if (nullptr == interopMemory_ || !interopMemory_->create(memType, createParams)) {
|
|
delete interopMemory_;
|
|
interopMemory_ = nullptr;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
setInteropType(type);
|
|
|
|
return true;
|
|
}
|
|
|
|
//! Releases everything attached to this memory object: the VA cache entry,
//! the interop memory, the map target, the pinned memory and, where direct
//! host access was set up, the mapping of this resource.
Memory::~Memory()
{
    // Clean VA cache
    dev().removeVACache(this);

    delete interopMemory_;

    // Release associated map target, if any
    if (nullptr != mapMemory_) {
        // mapMemory() forwards whatever getDeviceMemory() returns, which
        // may be null, so check it before the unmap
        Memory* mapTarget = mapMemory();
        if (nullptr != mapTarget) {
            mapTarget->unmap(nullptr);
        }
        mapMemory_->release();
    }

    // Destroy pinned memory
    if (flags_ & PinnedMemoryAlloced) {
        delete pinnedMemory_;
    }

    if ((owner() != nullptr) && isHostMemDirectAccess() &&
        !(flags_ & SubMemoryObject) &&
        (memoryType() != Resource::ExternalPhysical)) {
        // Unmap memory if direct access was requested
        unmap(nullptr);
    }
}
|
|
|
|
void
|
|
Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
|
|
{
|
|
// If the last writer was another GPU, then make a writeback
|
|
if (!isHostMemDirectAccess() &&
|
|
(owner()->getLastWriter() != nullptr) &&
|
|
(&dev() != owner()->getLastWriter())) {
|
|
mgpuCacheWriteBack();
|
|
}
|
|
|
|
// If host memory doesn't have direct access, then we have to synchronize
|
|
if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) {
|
|
bool hasUpdates = true;
|
|
|
|
// Make sure the parent of subbuffer is up to date
|
|
if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) {
|
|
pal::Memory* gpuMemory = dev().getGpuMemory(owner()->parent());
|
|
|
|
//! \note: Skipping the sync for a view doesn't reflect the parent settings,
|
|
//! since a view is a small portion of parent
|
|
device::Memory::SyncFlags syncFlagsTmp;
|
|
|
|
// Sync parent from a view, so views have to be skipped
|
|
syncFlagsTmp.skipViews_ = true;
|
|
|
|
// Make sure the parent sync is an unique operation.
|
|
// If the app uses multiple subbuffers from multiple queues,
|
|
// then the parent sync can be called from multiple threads
|
|
amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
|
|
gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp);
|
|
//! \note Don't do early exit here, since we still have to sync
|
|
//! this view, if the parent sync operation was a NOP.
|
|
//! If parent was synchronized, then this view sync will be a NOP
|
|
}
|
|
|
|
// Is this a NOP?
|
|
if ((version_ == owner()->getVersion()) ||
|
|
(&dev() == owner()->getLastWriter())) {
|
|
hasUpdates = false;
|
|
}
|
|
|
|
// Update all available views, since we sync the parent
|
|
if ((owner()->subBuffers().size() != 0) &&
|
|
(hasUpdates || !syncFlags.skipViews_)) {
|
|
device::Memory::SyncFlags syncFlagsTmp;
|
|
|
|
// Sync views from parent, so parent has to be skipped
|
|
syncFlagsTmp.skipParent_ = true;
|
|
|
|
if (hasUpdates) {
|
|
// Parent will be synced so update all views with a skip
|
|
syncFlagsTmp.skipEntire_ = true;
|
|
}
|
|
else {
|
|
// Passthrough the skip entire flag to the views, since
|
|
// any view is a submemory of the parent
|
|
syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
|
|
}
|
|
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
for (auto& sub : owner()->subBuffers()) {
|
|
//! \note Don't allow subbuffer's allocation in the worker thread.
|
|
//! It may cause a system lock, because possible resource
|
|
//! destruction, heap reallocation or subbuffer allocation
|
|
static const bool AllocSubBuffer = false;
|
|
device::Memory* devSub =
|
|
sub->getDeviceMemory(dev(), AllocSubBuffer);
|
|
if (nullptr != devSub) {
|
|
pal::Memory* gpuSub = reinterpret_cast<pal::Memory*>(devSub);
|
|
gpuSub->syncCacheFromHost(gpu, syncFlagsTmp);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Make sure we didn't have a NOP,
|
|
// because this GPU device was the last writer
|
|
if (&dev() != owner()->getLastWriter()) {
|
|
// Update the latest version
|
|
version_ = owner()->getVersion();
|
|
}
|
|
|
|
// Exit if sync is a NOP or sync can be skipped
|
|
if (!hasUpdates || syncFlags.skipEntire_) {
|
|
return;
|
|
}
|
|
|
|
bool result = false;
|
|
static const bool Entire = true;
|
|
amd::Coord3D origin(0, 0, 0);
|
|
|
|
// If host memory was pinned then make a transfer
|
|
if (flags_ & PinnedMemoryAlloced) {
|
|
if (desc().buffer_) {
|
|
amd::Coord3D region(owner()->getSize());
|
|
result = gpu.blitMgr().copyBuffer(*pinnedMemory_,
|
|
*this, origin, origin, region, Entire);
|
|
}
|
|
else {
|
|
amd::Image& image = static_cast<amd::Image&>(*owner());
|
|
result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_,
|
|
*this, origin, origin, image.getRegion(), Entire,
|
|
image.getRowPitch(), image.getSlicePitch());
|
|
}
|
|
}
|
|
|
|
if (!result) {
|
|
if (desc().buffer_) {
|
|
amd::Coord3D region(owner()->getSize());
|
|
result = gpu.blitMgr().writeBuffer(owner()->getHostMem(),
|
|
*this, origin, region, Entire);
|
|
}
|
|
else {
|
|
amd::Image& image = static_cast<amd::Image&>(*owner());
|
|
result = gpu.blitMgr().writeImage(owner()->getHostMem(),
|
|
*this, origin, image.getRegion(),
|
|
image.getRowPitch(), image.getSlicePitch(), Entire);
|
|
}
|
|
}
|
|
|
|
// Should never fail
|
|
assert(result && "Memory synchronization failed!");
|
|
}
|
|
}
|
|
|
|
void
|
|
Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags)
|
|
{
|
|
// Sanity checks
|
|
assert(owner() != nullptr);
|
|
|
|
// If host memory doesn't have direct access, then we have to synchronize
|
|
if (!isHostMemDirectAccess()) {
|
|
bool hasUpdates = true;
|
|
|
|
// Make sure the parent of subbuffer is up to date
|
|
if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) {
|
|
device::Memory* m = owner()->parent()->getDeviceMemory(dev());
|
|
|
|
//! \note: Skipping the sync for a view doesn't reflect the parent settings,
|
|
//! since a view is a small portion of parent
|
|
device::Memory::SyncFlags syncFlagsTmp;
|
|
|
|
// Sync parent from a view, so views have to be skipped
|
|
syncFlagsTmp.skipViews_ = true;
|
|
|
|
// Make sure the parent sync is an unique operation.
|
|
// If the app uses multiple subbuffers from multiple queues,
|
|
// then the parent sync can be called from multiple threads
|
|
amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
|
|
m->syncHostFromCache(syncFlagsTmp);
|
|
//! \note Don't do early exit here, since we still have to sync
|
|
//! this view, if the parent sync operation was a NOP.
|
|
//! If parent was synchronized, then this view sync will be a NOP
|
|
}
|
|
|
|
// Is this a NOP?
|
|
if ((nullptr == owner()->getLastWriter()) ||
|
|
(version_ == owner()->getVersion())) {
|
|
hasUpdates = false;
|
|
}
|
|
|
|
// Update all available views, since we sync the parent
|
|
if ((owner()->subBuffers().size() != 0) &&
|
|
(hasUpdates || !syncFlags.skipViews_)) {
|
|
device::Memory::SyncFlags syncFlagsTmp;
|
|
|
|
// Sync views from parent, so parent has to be skipped
|
|
syncFlagsTmp.skipParent_ = true;
|
|
|
|
if (hasUpdates) {
|
|
// Parent will be synced so update all views with a skip
|
|
syncFlagsTmp.skipEntire_ = true;
|
|
}
|
|
else {
|
|
// Passthrough the skip entire flag to the views, since
|
|
// any view is a submemory of the parent
|
|
syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
|
|
}
|
|
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
for (auto& sub : owner()->subBuffers()) {
|
|
//! \note Don't allow subbuffer's allocation in the worker thread.
|
|
//! It may cause a system lock, because possible resource
|
|
//! destruction, heap reallocation or subbuffer allocation
|
|
static const bool AllocSubBuffer = false;
|
|
device::Memory* devSub =
|
|
sub->getDeviceMemory(dev(), AllocSubBuffer);
|
|
if (nullptr != devSub) {
|
|
pal::Memory* gpuSub = reinterpret_cast<pal::Memory*>(devSub);
|
|
gpuSub->syncHostFromCache(syncFlagsTmp);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Make sure we didn't have a NOP,
|
|
// because CPU was the last writer
|
|
if (nullptr != owner()->getLastWriter()) {
|
|
// Mark parent as up to date, set our version accordingly
|
|
version_ = owner()->getVersion();
|
|
}
|
|
|
|
// Exit if sync is a NOP or sync can be skipped
|
|
if (!hasUpdates || syncFlags.skipEntire_) {
|
|
return;
|
|
}
|
|
|
|
bool result = false;
|
|
static const bool Entire = true;
|
|
amd::Coord3D origin(0, 0, 0);
|
|
|
|
// If backing store was pinned then make a transfer
|
|
if (flags_ & PinnedMemoryAlloced) {
|
|
if (desc().buffer_) {
|
|
amd::Coord3D region(owner()->getSize());
|
|
result = dev().xferMgr().copyBuffer(*this,
|
|
*pinnedMemory_, origin, origin, region, Entire);
|
|
}
|
|
else {
|
|
amd::Image& image = static_cast<amd::Image&>(*owner());
|
|
result = dev().xferMgr().copyImageToBuffer(*this,
|
|
*pinnedMemory_, origin, origin, image.getRegion(), Entire,
|
|
image.getRowPitch(), image.getSlicePitch());
|
|
}
|
|
}
|
|
|
|
// Just do a basic host read
|
|
if (!result) {
|
|
if (desc().buffer_) {
|
|
amd::Coord3D region(owner()->getSize());
|
|
result = dev().xferMgr().readBuffer(*this,
|
|
owner()->getHostMem(), origin, region, Entire);
|
|
}
|
|
else {
|
|
amd::Image& image = static_cast<amd::Image&>(*owner());
|
|
result = dev().xferMgr().readImage(*this,
|
|
owner()->getHostMem(), origin, image.getRegion(),
|
|
image.getRowPitch(), image.getSlicePitch(), Entire);
|
|
}
|
|
}
|
|
|
|
// Should never fail
|
|
assert(result && "Memory synchronization failed!");
|
|
}
|
|
}
|
|
|
|
//! Creates a view of this memory object for the sub buffer \p subBufferOwner.
//!
//! The origin and size of the view are taken from \p subBufferOwner. On
//! success the host pointer of \p subBufferOwner is set to this owner's host
//! pointer plus the origin, or to nullptr if this owner has no host memory.
//!
//! \param subBufferOwner  amd::Memory object that will own the view
//! \return the new view, or nullptr if it could not be created
pal::Memory*
Memory::createBufferView(amd::Memory& subBufferOwner)
{
    const size_t offset = subBufferOwner.getOrigin();
    const size_t viewSize = subBufferOwner.getSize();

    // Create a memory object
    pal::Memory* viewMemory = new pal::Memory(dev(), subBufferOwner, viewSize);
    if (nullptr == viewMemory) {
        return nullptr;
    }

    // Describe the view: a window of viewSize at offset into this resource
    Resource::ViewParams params;
    params.owner_ = &subBufferOwner;
    params.gpu_ = static_cast<VirtualGPU*>(subBufferOwner.getVirtualDevice());
    params.offset_ = offset;
    params.size_ = viewSize;
    params.resource_ = this;
    params.memory_ = this;
    if (!viewMemory->create(Resource::View, &params)) {
        delete viewMemory;
        return nullptr;
    }

    // Explicitly set the host memory location,
    // because the parent location could change after reallocation
    if (nullptr != owner()->getHostMem()) {
        subBufferOwner.setHostMem(
            reinterpret_cast<char*>(owner()->getHostMem()) + offset);
    }
    else {
        subBufferOwner.setHostMem(nullptr);
    }

    return viewMemory;
}
|
|
|
|
void
|
|
Memory::decIndMapCount()
|
|
{
|
|
// Map/unmap must be serialized
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
if (indirectMapCount_ == 0) {
|
|
if (!mipMapped()) {
|
|
LogError("decIndMapCount() called when indirectMapCount_ already zero");
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Decrement the counter and release indirect map if it's the last op
|
|
if (--indirectMapCount_ == 0) {
|
|
if (nullptr != mapMemory_) {
|
|
amd::Memory* memory = mapMemory_;
|
|
amd::Memory* empty = nullptr;
|
|
|
|
// Get GPU memory
|
|
Memory* gpuMemory = mapMemory();
|
|
gpuMemory->unmap(nullptr);
|
|
|
|
if (!dev().addMapTarget(memory)) {
|
|
memory->release();
|
|
}
|
|
|
|
// Map/unamp is serialized for the same memory object,
|
|
// so it's safe to clear the pointer
|
|
assert((mapMemory_ != nullptr) && "Mapped buffer should be valid");
|
|
mapMemory_ = nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Note - must be called by the device under the async lock, so no spinning
|
|
// or long pauses allowed in this function.
|
|
void*
|
|
Memory::allocMapTarget(
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& region,
|
|
uint mapFlags,
|
|
size_t* rowPitch,
|
|
size_t* slicePitch)
|
|
{
|
|
// Sanity checks
|
|
assert(owner() != nullptr);
|
|
|
|
// Map/unmap must be serialized
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
address mapAddress = nullptr;
|
|
size_t offset = origin[0];
|
|
|
|
//For SVM implementation, we cannot use cached map. if svm space, use the svm host pointer
|
|
void *initHostPtr = owner()->getSvmPtr();
|
|
if (nullptr != initHostPtr) {
|
|
owner()->commitSvmMemory();
|
|
}
|
|
|
|
if (owner()->numDevices() > 1) {
|
|
if ((nullptr == initHostPtr) && (owner()->getHostMem() == nullptr)) {
|
|
static const bool forceAllocHostMem = true;
|
|
if (!owner()->allocHostMemory(nullptr, forceAllocHostMem)) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
incIndMapCount();
|
|
// If host memory exists, use it
|
|
if ((owner()->getHostMem() != nullptr) && isDirectMap()) {
|
|
mapAddress = reinterpret_cast<address>(owner()->getHostMem());
|
|
}
|
|
// If resource is a persistent allocation, we can use it directly
|
|
else if (isPersistentDirectMap()) {
|
|
if (nullptr == map(nullptr)) {
|
|
LogError("Could not map target persistent resource");
|
|
decIndMapCount();
|
|
return nullptr;
|
|
}
|
|
mapAddress = data();
|
|
}
|
|
// Otherwise we can use a remote resource:
|
|
else {
|
|
// Are we in range?
|
|
size_t elementCount = desc().width_;
|
|
size_t rSize = elementCount * elementSize();
|
|
if (offset >= rSize || offset + region[0] > rSize) {
|
|
LogWarning("Memory::allocMapTarget() - offset/size out of bounds");
|
|
return nullptr;
|
|
}
|
|
|
|
// Allocate a map resource if there isn't any yet
|
|
if (indirectMapCount_ == 1) {
|
|
const static bool SysMem = true;
|
|
bool failed = false;
|
|
amd::Memory* memory = nullptr;
|
|
// Search for a possible indirect resource
|
|
cl_mem_flags flag = 0;
|
|
bool canBeCached = true;
|
|
if (nullptr != initHostPtr) {
|
|
//make sure the host memory is committed already, or we have a big problem.
|
|
assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!");
|
|
flag = CL_MEM_USE_HOST_PTR;
|
|
canBeCached = false;
|
|
}
|
|
else {
|
|
memory = dev().findMapTarget(owner()->getSize());
|
|
}
|
|
|
|
if (memory == nullptr) {
|
|
// for map target of svm buffer , we need use svm host ptr
|
|
memory = new(dev().context())
|
|
amd::Buffer(dev().context(), flag, owner()->getSize());
|
|
Memory* gpuMemory;
|
|
|
|
do {
|
|
if ((memory == nullptr) || !memory->create(initHostPtr, SysMem)) {
|
|
failed = true;
|
|
break;
|
|
}
|
|
memory->setCacheStatus(canBeCached);
|
|
|
|
gpuMemory = reinterpret_cast<Memory*>
|
|
(memory->getDeviceMemory(dev()));
|
|
|
|
// Create, Map and get the base pointer for the resource
|
|
if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
|
|
failed = true;
|
|
break;
|
|
}
|
|
}
|
|
while (false);
|
|
}
|
|
|
|
if (failed) {
|
|
if (memory != nullptr) {
|
|
memory->release();
|
|
}
|
|
decIndMapCount();
|
|
LogError("Could not map target resource");
|
|
return nullptr;
|
|
}
|
|
|
|
// Map/unamp is serialized for the same memory object,
|
|
// so it's safe to assign the new pointer
|
|
assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid");
|
|
mapMemory_ = memory;
|
|
}
|
|
else {
|
|
// Did the map resource allocation fail?
|
|
if (mapMemory_ == nullptr) {
|
|
LogError("Could not map target resource");
|
|
return nullptr;
|
|
}
|
|
}
|
|
mapAddress = mapMemory()->data();
|
|
}
|
|
|
|
return mapAddress + offset;
|
|
}
|
|
|
|
bool
|
|
Memory::pinSystemMemory(void* hostPtr, size_t size)
|
|
{
|
|
bool result = false;
|
|
|
|
// If memory has a direct access already, then skip the host memory pinning
|
|
if (isHostMemDirectAccess()) {
|
|
return true;
|
|
}
|
|
|
|
// Destroy the old pinned memory if it was already allocated
|
|
if (flags_ & PinnedMemoryAlloced) {
|
|
delete pinnedMemory_;
|
|
flags_ &= ~PinnedMemoryAlloced;
|
|
}
|
|
|
|
// Allocate memory for the pinned object
|
|
pinnedMemory_ = new Memory(dev(), size);
|
|
|
|
if (pinnedMemory_ == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
// Check if it's a view
|
|
if (flags_ & SubMemoryObject) {
|
|
const pal::Memory* gpuMemory;
|
|
if (owner() != nullptr) {
|
|
gpuMemory = dev().getGpuMemory(owner()->parent());
|
|
}
|
|
else {
|
|
gpuMemory = parent();
|
|
}
|
|
|
|
if (gpuMemory->flags_ & PinnedMemoryAlloced) {
|
|
Resource::ViewParams params;
|
|
params.owner_ = owner();
|
|
params.offset_ = owner()->getOrigin();
|
|
params.size_ = owner()->getSize();
|
|
params.resource_ = gpuMemory->pinnedMemory_;
|
|
params.memory_ = nullptr;
|
|
result = pinnedMemory_->create(Resource::View, ¶ms);
|
|
}
|
|
}
|
|
else {
|
|
Resource::PinnedParams params;
|
|
// Fill resource creation parameters
|
|
params.owner_ = owner();
|
|
params.hostMemRef_ = owner()->getHostMemRef();
|
|
params.size_ = size;
|
|
|
|
// Create resource
|
|
result = pinnedMemory_->create(Resource::Pinned, ¶ms);
|
|
}
|
|
|
|
if (!result) {
|
|
delete pinnedMemory_;
|
|
pinnedMemory_ = nullptr;
|
|
return false;
|
|
}
|
|
|
|
flags_ |= PinnedMemoryAlloced;
|
|
return true;
|
|
}
|
|
|
|
void*
|
|
Memory::cpuMap(
|
|
device::VirtualDevice& vDev, uint flags,
|
|
uint startLayer, uint numLayers,
|
|
size_t* rowPitch,
|
|
size_t* slicePitch)
|
|
{
|
|
uint resFlags = 0;
|
|
if (flags == Memory::CpuReadOnly) {
|
|
resFlags = Resource::ReadOnly;
|
|
}
|
|
else if (flags == Memory::CpuWriteOnly) {
|
|
resFlags = Resource::WriteOnly;
|
|
}
|
|
|
|
void* ptr = map(&static_cast<VirtualGPU&>(vDev), resFlags, startLayer, numLayers);
|
|
if (!desc().buffer_) {
|
|
*rowPitch = desc().pitch_ * elementSize();
|
|
*slicePitch = desc().slice_ * elementSize();
|
|
}
|
|
return ptr;
|
|
}
|
|
|
|
void
|
|
Memory::cpuUnmap(device::VirtualDevice& vDev)
|
|
{
|
|
unmap(&static_cast<VirtualGPU&>(vDev));
|
|
}
|
|
|
|
//! \note moveTo() must be called only from outside of
|
|
//! VirtualGPU submit command methods.
|
|
//! Otherwise a deadlock in lockVgpus() is possible.
|
|
//! Also the logic in this function is very specific to
|
|
//! the zero-copy functionality.
|
|
|
|
bool
|
|
Memory::moveTo(Memory& dst)
|
|
{
|
|
bool result = false;
|
|
|
|
// Make sure that all virtual devices don't process any commands
|
|
Device::ScopedLockVgpus lock(dev());
|
|
|
|
// Wait for idle on all virtual GPUs
|
|
//!@note It's enough to wait on the active queue only
|
|
for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
|
|
wait(*(dev().vgpus()[idx]));
|
|
}
|
|
|
|
static const bool Entire = true;
|
|
amd::Coord3D origin(0, 0, 0);
|
|
amd::Coord3D region(size());
|
|
|
|
// Transfer the data from old location to a new one
|
|
if (dev().xferMgr().copyBuffer(
|
|
*this, dst, origin, origin, region, Entire)) {
|
|
// Move all properties to the new object
|
|
dst.mapMemory_ = mapMemory_;
|
|
mapMemory_ = nullptr;
|
|
|
|
dst.flags_ |= flags_ & ~HostMemoryDirectAccess;
|
|
flags_ &= HostMemoryDirectAccess;
|
|
|
|
dst.indirectMapCount_ = indirectMapCount_;
|
|
indirectMapCount_ = 0;
|
|
|
|
dst.pinnedMemory_ = pinnedMemory_;
|
|
pinnedMemory_ = nullptr;
|
|
|
|
// Replace the device memory object
|
|
//! @note: current object will be destroyed
|
|
owner()->replaceDeviceMemory(&dev(), &dst);
|
|
result = true;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
//! Returns the device memory of the map target on this device,
//! or nullptr if no map target is attached.
Memory*
Memory::mapMemory() const
{
    if (nullptr == mapMemory_) {
        return nullptr;
    }
    return reinterpret_cast<Memory*>(mapMemory_->getDeviceMemory(dev()));
}
|
|
|
|
void
|
|
Memory::mgpuCacheWriteBack()
|
|
{
|
|
// Lock memory object, so only one write back can occur
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
// Attempt to allocate a staging buffer if don't have any
|
|
if (owner()->getHostMem() == nullptr) {
|
|
static const bool forceAllocHostMem = true;
|
|
if (owner()->allocHostMemory(nullptr, forceAllocHostMem)) {
|
|
//! \note Ignore pinning result
|
|
bool ok = pinSystemMemory(
|
|
owner()->getHostMem(), owner()->getHostMemRef()->size());
|
|
}
|
|
}
|
|
|
|
// Make synchronization
|
|
if (owner()->getHostMem() != nullptr) {
|
|
owner()->cacheWriteBack();
|
|
}
|
|
}
|
|
|
|
//! Creates a view of this buffer for the sub buffer \p subBufferOwner.
//!
//! The origin and size of the view are taken from \p subBufferOwner.
//!
//! \param subBufferOwner  amd::Memory object that will own the view
//! \return the new view, or nullptr if it could not be created
Memory*
Buffer::createBufferView(amd::Memory& subBufferOwner) const
{
    const size_t offset = subBufferOwner.getOrigin();
    const size_t viewSize = subBufferOwner.getSize();

    // Create a memory object
    pal::Memory* subBuffer = new pal::Buffer(dev(), subBufferOwner, viewSize);
    if (nullptr == subBuffer) {
        return nullptr;
    }

    // Allocate a view for this buffer object
    Resource::ViewParams params;
    params.owner_ = &subBufferOwner;
    params.offset_ = offset;
    params.size_ = viewSize;
    params.resource_ = this;
    params.memory_ = this;

    if (!subBuffer->create(Resource::View, &params)) {
        delete subBuffer;
        return nullptr;
    }

    return subBuffer;
}
|
|
|
|
void*
|
|
Image::allocMapTarget(
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& region,
|
|
uint mapFlags,
|
|
size_t* rowPitch,
|
|
size_t* slicePitch)
|
|
{
|
|
// Sanity checks
|
|
assert(owner() != nullptr);
|
|
bool useRemoteResource = true;
|
|
size_t slicePitchTmp = 0;
|
|
size_t height = desc().height_;
|
|
size_t depth = desc().depth_;
|
|
|
|
// Map/unmap must be serialized
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
address mapAddress = nullptr;
|
|
size_t offset = origin[0];
|
|
|
|
incIndMapCount();
|
|
|
|
// If host memory exists, use it
|
|
if ((owner()->getHostMem() != nullptr) && isDirectMap()) {
|
|
useRemoteResource = false;
|
|
mapAddress = reinterpret_cast<address>(owner()->getHostMem());
|
|
amd::Image* amdImage = owner()->asImage();
|
|
|
|
// Calculate the offset in bytes
|
|
offset *= elementSize();
|
|
|
|
// Update the row and slice pitches value
|
|
*rowPitch = (amdImage->getRowPitch() == 0) ?
|
|
(desc().width_ * elementSize()) : amdImage->getRowPitch();
|
|
slicePitchTmp = (amdImage->getSlicePitch() == 0) ?
|
|
(height * (*rowPitch)) : amdImage->getSlicePitch();
|
|
|
|
// Adjust the offset in Y and Z dimensions
|
|
offset += origin[1] * (*rowPitch);
|
|
offset += origin[2] * slicePitchTmp;
|
|
}
|
|
// If resource is a persistent allocation, we can use it directly
|
|
//! @note Even if resource is a persistent allocation,
|
|
//! runtime can't use it directly,
|
|
//! because CAL volume map doesn't work properly.
|
|
//! @todo arrays can be added for persistent lock with some CAL changes
|
|
else if (isPersistentDirectMap()) {
|
|
if (nullptr == map(nullptr)) {
|
|
useRemoteResource = true;
|
|
LogError("Could not map target persistent resource, try remote resource");
|
|
}
|
|
else {
|
|
useRemoteResource = false;
|
|
mapAddress = data();
|
|
|
|
// Calculate the offset in bytes
|
|
offset *= elementSize();
|
|
|
|
// Update the row pitch value
|
|
*rowPitch = desc().pitch_ * elementSize();
|
|
|
|
// Adjust the offset in Y dimension
|
|
offset += origin[1] * (*rowPitch);
|
|
}
|
|
}
|
|
|
|
// Otherwise we can use a remote resource:
|
|
if (useRemoteResource) {
|
|
// Calculate X offset in bytes
|
|
offset *= elementSize();
|
|
|
|
// Allocate a map resource if there isn't any yet
|
|
if (indirectMapCount_ == 1) {
|
|
const static bool SysMem = true;
|
|
bool failed = false;
|
|
amd::Memory* memory;
|
|
|
|
// Search for a possible indirect resource
|
|
memory = dev().findMapTarget(owner()->getSize());
|
|
|
|
if (memory == nullptr) {
|
|
// Allocate a new buffer to use as the map target
|
|
//! @note Allocate a 1D buffer, since CAL issues with 3D
|
|
//! Also HW doesn't support untiled images
|
|
memory = new (dev().context())
|
|
amd::Buffer(dev().context(), 0,
|
|
desc().width_ * height * depth * elementSize());
|
|
memory->setVirtualDevice(owner()->getVirtualDevice());
|
|
|
|
Memory* gpuMemory;
|
|
do {
|
|
if ((memory == nullptr) || !memory->create(nullptr, SysMem)) {
|
|
failed = true;
|
|
break;
|
|
}
|
|
|
|
gpuMemory = reinterpret_cast<Memory*>
|
|
(memory->getDeviceMemory(dev()));
|
|
|
|
// Create, Map and get the base pointer for the resource
|
|
if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
|
|
failed = true;
|
|
break;
|
|
}
|
|
}
|
|
while (false);
|
|
}
|
|
|
|
if (failed) {
|
|
if (memory != nullptr) {
|
|
memory->release();
|
|
}
|
|
decIndMapCount();
|
|
LogError("Could not map target resource");
|
|
return nullptr;
|
|
}
|
|
|
|
// Map/unamp is serialized for the same memory object,
|
|
// so it's safe to assign the new pointer
|
|
assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid");
|
|
mapMemory_ = memory;
|
|
}
|
|
else {
|
|
// Did the map resource allocation fail?
|
|
if (mapMemory_ == nullptr) {
|
|
LogError("Could not map target resource");
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
mapAddress = mapMemory()->data();
|
|
|
|
// Update the row and slice pitches value
|
|
*rowPitch = region[0] * elementSize();
|
|
if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
|
|
slicePitchTmp = *rowPitch ;
|
|
}
|
|
else {
|
|
slicePitchTmp = *rowPitch * region[1];
|
|
}
|
|
// Use start of the indirect buffer
|
|
offset = 0;
|
|
}
|
|
|
|
if (slicePitch != nullptr) {
|
|
*slicePitch = slicePitchTmp;
|
|
}
|
|
|
|
return mapAddress + offset;
|
|
}
|
|
|
|
} // namespace pal
|