//! Implementation of GPU device memory management #include "top.hpp" #include "thread/thread.hpp" #include "thread/monitor.hpp" #include "device/device.hpp" #include "device/gpu/gpudevice.hpp" #include "device/gpu/gpublit.hpp" #ifdef _WIN32 #include #include "amdocl/cl_d3d9_amd.hpp" #include "amdocl/cl_d3d10_amd.hpp" #include "amdocl/cl_d3d11_amd.hpp" #endif //_WIN32 #include "amdocl/cl_gl_amd.hpp" #include #include #include #include //! Turn this on to enable sanity checks before and after every heap operation. #if DEBUG #define EXTRA_HEAP_CHECKS 1 #endif // DEBUG namespace gpu { Memory::Memory( const Device& gpuDev, amd::Memory& owner, HeapBlock* hb, size_t size) : device::Memory(owner) , Resource(gpuDev, ((hb) ? hb->size_ : size) / Heap::ElementSize, Heap::ElementType) , hb_(hb) { init(); if (NULL != hb_) hb_->setMemory(this); if (owner.parent() != NULL) { flags_ |= SubMemoryObject; } } Memory::Memory( const Device& gpuDev, HeapBlock& hb) : device::Memory(hb.size_) , Resource(gpuDev, hb.size_ / Heap::ElementSize, Heap::ElementType) , hb_(&hb) { init(); hb.setMemory(this); } Memory::Memory( const Device& gpuDev, size_t size) : device::Memory(size) , Resource(gpuDev, amd::alignUp(size, Heap::ElementSize) / Heap::ElementSize, Heap::ElementType) , hb_(NULL) { init(); } Memory::Memory( const Device& gpuDev, amd::Memory& owner, size_t width, cmSurfFmt format ) : device::Memory(owner) , Resource(gpuDev, width, format) , hb_(NULL) { init(); if (owner.parent() != NULL) { flags_ |= SubMemoryObject; } } Memory::Memory( const Device& gpuDev, size_t size, size_t width, cmSurfFmt format ) : device::Memory(size) , Resource(gpuDev, width, format) , hb_(NULL) { init(); } Memory::Memory( const Device& gpuDev, amd::Memory& owner, size_t width, size_t height, size_t depth, cmSurfFmt format, gslChannelOrder chOrder, cl_mem_object_type imageType ) : device::Memory(owner) , Resource(gpuDev, width, height, depth, format, chOrder, imageType) , hb_(NULL) { init(); if (owner.parent() != NULL) { flags_ |= SubMemoryObject; } } Memory::Memory( const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth, cmSurfFmt format, gslChannelOrder chOrder, cl_mem_object_type imageType ) : device::Memory(size) , Resource(gpuDev, width, height, depth, format, chOrder, imageType) , hb_(NULL) { init(); } void Memory::init() { indirectMapCount_ = 0; interopType_ = InteropNone; interopMemory_ = NULL; pinnedMemory_ = NULL; parent_ = NULL; } #ifdef _WIN32 static HANDLE getSharedHandle(IUnknown* pIface) { // Sanity checks assert(pIface != NULL); HRESULT hRes; HANDLE hShared; IDXGIResource* pDxgiRes = NULL; if((hRes = (const_cast(pIface))->QueryInterface( __uuidof(IDXGIResource), (void**) &pDxgiRes)) != S_OK) { return (HANDLE) 0; } if(!pDxgiRes) { return (HANDLE) 0; } hRes = pDxgiRes->GetSharedHandle(&hShared); pDxgiRes->Release(); if(hRes != S_OK) { return (HANDLE) 0; } return hShared; } #endif //_WIN32 bool Memory::create( Resource::MemoryType memType, Resource::CreateParams* params) { bool result; // Reset the flag in case we reallocate the heap in local/remote flags_ &= ~HostMemoryDirectAccess; // Create a resource in CAL result = Resource::create(memType, params); // Check if CAL created a resource if (result) { switch (memoryType()) { case Resource::Pinned: case Resource::ExternalPhysical: // Marks memory object for direct GPU access to the host memory flags_ |= HostMemoryDirectAccess; break; case Resource::Remote: case Resource::RemoteUSWC: // @todo Enable unconditional optimization for remote memory if ((owner() != NULL && owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) || (hb() == NULL)) { if (!cal()->tiled_) { // Marks memory object for direct GPU access to the host memory flags_ |= HostMemoryDirectAccess; } } break; case Resource::View: { Resource::ViewParams* view = reinterpret_cast(params); // Check if parent was allocated in system memory if ((view->resource_->memoryType() == Resource::Pinned) || (((view->resource_->memoryType() == Resource::Remote) || (view->resource_->memoryType() == Resource::RemoteUSWC)) && // @todo Enable unconditional optimization for remote memory // Check for external allocation, to avoid the optimization // for non-VM (double copy) mode (owner() != NULL) && ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) || dev().settings().remoteAlloc_))) { // Marks memory object for direct GPU access to the host memory flags_ |= HostMemoryDirectAccess; } if ((view->owner_ != NULL) && (view->owner_->parent() != NULL)) { parent_ = reinterpret_cast(view->memory_); flags_ |= SubMemoryObject; } break; } case Resource::ImageView: { Resource::ImageViewParams* view = reinterpret_cast(params); parent_ = reinterpret_cast(view->memory_); flags_ |= SubMemoryObject; break; } case Resource::ImageBuffer: { Resource::ImageBufferParams* view = reinterpret_cast(params); parent_ = reinterpret_cast(view->memory_); flags_ |= SubMemoryObject; break; } default: break; } } return result; } bool Memory::processGLResource(GLResourceOP operation) { bool retVal = false; switch (operation) { case GLDecompressResource: retVal = gslGLAcquire(); break; case GLInvalidateFBO: retVal = gslGLRelease(); break; default: assert(false && "unknown GLResourceOP"); } return retVal; } bool Memory::createInterop(InteropType type) { Resource::MemoryType memType = Resource::Empty; Resource::OGLInteropParams oglRes; #ifdef _WIN32 Resource::D3DInteropParams d3dRes; #endif //_WIN32 // Only external objects support interop assert(owner() != NULL); Resource::CreateParams* createParams = NULL; amd::InteropObject* interop = owner()->getInteropObj(); assert((interop != NULL) && "An invalid interop object is impossible!"); amd::GLObject* glObject = interop->asGLObject(); #ifdef _WIN32 amd::D3D10Object* d3d10Object = interop->asD3D10Object(); amd::D3D11Object* d3d11Object = interop->asD3D11Object(); amd::D3D9Object* d3d9Object = interop->asD3D9Object(); if (d3d10Object != NULL) { createParams = &d3dRes; d3dRes.owner_ = owner(); const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc(); memType = Resource::D3D10Interop; // Get shared handle if ((d3dRes.handle_ = getSharedHandle(d3d10Object->getD3D10Resource()))) { d3dRes.iDirect3D_ = static_cast (d3d10Object->getD3D10Resource()); d3dRes.type_ = Resource::InteropTypeless; } d3dRes.misc = 0; // Find D3D10 object type switch (objDesc->objDim_) { case D3D10_RESOURCE_DIMENSION_BUFFER: d3dRes.type_ = Resource::InteropVertexBuffer; break; case D3D10_RESOURCE_DIMENSION_TEXTURE1D: case D3D10_RESOURCE_DIMENSION_TEXTURE2D: case D3D10_RESOURCE_DIMENSION_TEXTURE3D: d3dRes.type_ = Resource::InteropTexture; if (objDesc->mipLevels_ > 1) { d3dRes.type_ = Resource::InteropTextureViewLevel; if (objDesc->arraySize_ > 1) { d3dRes.layer_ = d3d10Object->getSubresource() / objDesc->mipLevels_; d3dRes.mipLevel_ = d3d10Object->getSubresource() % objDesc->mipLevels_; } else { d3dRes.layer_ = 0; d3dRes.mipLevel_ = d3d10Object->getSubresource(); } } break; default: return false; break; } } else if (d3d11Object != NULL) { createParams = &d3dRes; d3dRes.owner_ = owner(); const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc(); memType = Resource::D3D11Interop; // Get shared handle if ((d3dRes.handle_ = getSharedHandle(d3d11Object->getD3D11Resource()))) { d3dRes.iDirect3D_ = static_cast (d3d11Object->getD3D11Resource()); d3dRes.type_ = Resource::InteropTypeless; } d3dRes.misc = 0; // Find D3D11 object type switch (objDesc->objDim_) { case D3D11_RESOURCE_DIMENSION_BUFFER: d3dRes.type_ = Resource::InteropVertexBuffer; break; case D3D11_RESOURCE_DIMENSION_TEXTURE1D: case D3D11_RESOURCE_DIMENSION_TEXTURE2D: case D3D11_RESOURCE_DIMENSION_TEXTURE3D: d3dRes.type_ = Resource::InteropTexture; d3dRes.layer_= d3d11Object->getPlane(); d3dRes.misc = d3d11Object->getMiscFlag(); if (objDesc->mipLevels_ > 1) { d3dRes.type_ = Resource::InteropTextureViewLevel; if (objDesc->arraySize_ > 1) { d3dRes.layer_ = d3d11Object->getSubresource() / objDesc->mipLevels_; d3dRes.mipLevel_ = d3d11Object->getSubresource() % objDesc->mipLevels_; } else { d3dRes.layer_ = 0; d3dRes.mipLevel_ = d3d11Object->getSubresource(); } } break; default: return false; break; } } else if (d3d9Object != NULL) { createParams = &d3dRes; d3dRes.owner_ = owner(); const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc(); memType = Resource::D3D9Interop; // Get shared handle if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) { d3dRes.iDirect3D_ = static_cast (d3d9Object->getD3D9Resource()); d3dRes.type_ = Resource::InteropSurface; d3dRes.mipLevel_ = 0; d3dRes.layer_ = d3d9Object->getPlane(); d3dRes.misc = d3d9Object->getMiscFlag(); } } else #endif //_WIN32 if (glObject != NULL) { createParams = &oglRes; oglRes.owner_ = owner(); memType = Resource::OGLInterop; // Fill the interop creation parameters oglRes.handle_ = static_cast(glObject->getGLName()); // Find OGL object type switch (glObject->getCLGLObjectType()) { case CL_GL_OBJECT_BUFFER: oglRes.type_ = Resource::InteropVertexBuffer; break; case CL_GL_OBJECT_TEXTURE_BUFFER: case CL_GL_OBJECT_TEXTURE1D: case CL_GL_OBJECT_TEXTURE1D_ARRAY: case CL_GL_OBJECT_TEXTURE2D: case CL_GL_OBJECT_TEXTURE2D_ARRAY: case CL_GL_OBJECT_TEXTURE3D: oglRes.type_ = Resource::InteropTexture; if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) { switch (glObject->getCubemapFace()) { case GL_TEXTURE_CUBE_MAP_POSITIVE_X: case GL_TEXTURE_CUBE_MAP_NEGATIVE_X: case GL_TEXTURE_CUBE_MAP_POSITIVE_Y: case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y: case GL_TEXTURE_CUBE_MAP_POSITIVE_Z: case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z: oglRes.type_ = Resource::InteropTextureViewCube; oglRes.layer_ = glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X; oglRes.mipLevel_ = glObject->getGLMipLevel(); break; default: break; } } else if (glObject->getGLMipLevel() != 0) { oglRes.type_ = Resource::InteropTextureViewLevel; oglRes.layer_ = 0; oglRes.mipLevel_ = glObject->getGLMipLevel(); } break; case CL_GL_OBJECT_RENDERBUFFER: oglRes.type_ = Resource::InteropRenderBuffer; break; default: return false; break; } } else { return false; } oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_; oglRes.glDeviceContext_ = owner()->getContext().info().hDev_; // We dont pass any flags here for the GL Resource. oglRes.flags_ = 0; // Get the interop settings if (type == InteropDirectAccess) { // Create memory object if (!create(memType, createParams)) { return false; } } else { // Allocate Resource object for interop as buffer interopMemory_ = new Memory(dev(), size(), amd::alignUp(size(), Heap::ElementSize) / Heap::ElementSize, Heap::ElementType); // Create the interop object in CAL if (NULL == interopMemory_ || !interopMemory_->create(memType, createParams)) { delete interopMemory_; interopMemory_ = NULL; return false; } } setInteropType(type); return true; } Memory::~Memory() { // Clean VA cache dev().removeVACache(this); // Release associated heap block, if any if (hb_) { // Protect heap block from simultaneous release with realloc amd::ScopedLock k(dev().lockAsyncOps()); hb_->setMemory(NULL); hb_->free(); } delete interopMemory_; // Release associated map target, if any if (NULL != mapMemory_) { mapMemory()->unmap(NULL); mapMemory_->release(); } // Destory pinned memory if (flags_ & PinnedMemoryAlloced) { delete pinnedMemory_; } if ((owner() != NULL) && isHostMemDirectAccess() && !(flags_ & SubMemoryObject) && (memoryType() != Resource::ExternalPhysical)) { // Unmap memory if direct access was requested unmap(NULL); } } bool Memory::reallocate(HeapBlock* hb, const Resource* parent) { Resource::ViewParams params; params.size_ = hb->size_; params.resource_ = parent; params.memory_ = NULL; // Check if it's a view reallocation if (NULL != hb->parent_) { // The offset inside the view is unchanged params.offset_ = Resource::offset(); // Create a new view if (Resource::create(Resource::View, ¶ms)) { hb_ = hb; return true; } } else { params.offset_ = hb->offset_; if (Resource::reallocate(¶ms)) { hb_ = hb; return true; } } return false; } void Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) { // If the last writer was another GPU, then make a writeback if (!isHostMemDirectAccess() && (owner()->getLastWriter() != NULL) && (&dev() != owner()->getLastWriter())) { mgpuCacheWriteBack(); } // If host memory doesn't have direct access, then we have to synchronize if (!isHostMemDirectAccess() && (NULL != owner()->getHostMem())) { bool hasUpdates = true; // Make sure the parent of subbuffer is up to date if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { gpu::Memory* gpuMemory = dev().getGpuMemory(owner()->parent()); //! \note: Skipping the sync for a view doesn't reflect the parent settings, //! since a view is a small portion of parent device::Memory::SyncFlags syncFlagsTmp; // Sync parent from a view, so views have to be skipped syncFlagsTmp.skipViews_ = true; // Make sure the parent sync is an unique operation. // If the app uses multiple subbuffers from multiple queues, // then the parent sync can be called from multiple threads amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); //! \note Don't do early exit here, since we still have to sync //! this view, if the parent sync operation was a NOP. //! If parent was synchronized, then this view sync will be a NOP } // Is this a NOP? if ((version_ == owner()->getVersion()) || (&dev() == owner()->getLastWriter())) { hasUpdates = false; } // Update all available views, since we sync the parent if ((owner()->subBuffers().size() != 0) && (hasUpdates || !syncFlags.skipViews_)) { std::list::const_iterator it; device::Memory::SyncFlags syncFlagsTmp; // Sync views from parent, so parent has to be skipped syncFlagsTmp.skipParent_ = true; if (hasUpdates) { // Parent will be synced so update all views with a skip syncFlagsTmp.skipEntire_ = true; } else { // Passthrough the skip entire flag to the views, since // any view is a submemory of the parent syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; } amd::ScopedLock lock(owner()->lockMemoryOps()); for (it = owner()->subBuffers().begin(); it != owner()->subBuffers().end(); ++it) { //! \note Don't allow subbuffer's allocation in the worker thread. //! It may cause a system lock, because possible resource //! destruction, heap reallocation or subbuffer allocation static const bool AllocSubBuffer = false; device::Memory* devSub = (*it)->getDeviceMemory(dev(), AllocSubBuffer); if (NULL != devSub) { gpu::Memory* gpuSub = reinterpret_cast(devSub); gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); } } } // Make sure we didn't have a NOP, // because this GPU device was the last writer if (&dev() != owner()->getLastWriter()) { // Update the latest version version_ = owner()->getVersion(); } // Exit if sync is a NOP or sync can be skipped if (!hasUpdates || syncFlags.skipEntire_) { return; } bool result = false; static const bool Entire = true; amd::Coord3D origin(0, 0, 0); // If host memory was pinned then make a transfer if (flags_ & PinnedMemoryAlloced) { if (cal()->buffer_) { amd::Coord3D region(owner()->getSize()); result = gpu.blitMgr().copyBuffer(*pinnedMemory_, *this, origin, origin, region, Entire); } else { amd::Image& image = *owner()->asImage(); result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_, *this, origin, origin, image.getRegion(), Entire, image.getRowPitch(), image.getSlicePitch()); } } if (!result) { if (cal()->buffer_) { amd::Coord3D region(owner()->getSize()); result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), *this, origin, region, Entire); } else { amd::Image& image = *owner()->asImage(); result = gpu.blitMgr().writeImage(owner()->getHostMem(), *this, origin, image.getRegion(), image.getRowPitch(), image.getSlicePitch(), Entire); } } //!@todo A wait isn't really necessary. However //! Linux no-VM may have extra random failures. wait(gpu); // Should never fail assert(result && "Memory synchronization failed!"); } } void Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) { // Sanity checks assert(owner() != NULL); // If host memory doesn't have direct access, then we have to synchronize if (!isHostMemDirectAccess()) { bool hasUpdates = true; // Make sure the parent of subbuffer is up to date if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { device::Memory* m = owner()->parent()->getDeviceMemory(dev()); //! \note: Skipping the sync for a view doesn't reflect the parent settings, //! since a view is a small portion of parent device::Memory::SyncFlags syncFlagsTmp; // Sync parent from a view, so views have to be skipped syncFlagsTmp.skipViews_ = true; // Make sure the parent sync is an unique operation. // If the app uses multiple subbuffers from multiple queues, // then the parent sync can be called from multiple threads amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); m->syncHostFromCache(syncFlagsTmp); //! \note Don't do early exit here, since we still have to sync //! this view, if the parent sync operation was a NOP. //! If parent was synchronized, then this view sync will be a NOP } // Is this a NOP? if ((NULL == owner()->getLastWriter()) || (version_ == owner()->getVersion())) { hasUpdates = false; } // Update all available views, since we sync the parent if ((owner()->subBuffers().size() != 0) && (hasUpdates || !syncFlags.skipViews_)) { std::list::const_iterator it; device::Memory::SyncFlags syncFlagsTmp; // Sync views from parent, so parent has to be skipped syncFlagsTmp.skipParent_ = true; if (hasUpdates) { // Parent will be synced so update all views with a skip syncFlagsTmp.skipEntire_ = true; } else { // Passthrough the skip entire flag to the views, since // any view is a submemory of the parent syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; } amd::ScopedLock lock(owner()->lockMemoryOps()); for (it = owner()->subBuffers().begin(); it != owner()->subBuffers().end(); ++it) { //! \note Don't allow subbuffer's allocation in the worker thread. //! It may cause a system lock, because possible resource //! destruction, heap reallocation or subbuffer allocation static const bool AllocSubBuffer = false; device::Memory* devSub = (*it)->getDeviceMemory(dev(), AllocSubBuffer); if (NULL != devSub) { gpu::Memory* gpuSub = reinterpret_cast(devSub); gpuSub->syncHostFromCache(syncFlagsTmp); } } } // Make sure we didn't have a NOP, // because CPU was the last writer if (NULL != owner()->getLastWriter()) { // Mark parent as up to date, set our version accordingly version_ = owner()->getVersion(); } // Exit if sync is a NOP or sync can be skipped if (!hasUpdates || syncFlags.skipEntire_) { return; } bool result = false; static const bool Entire = true; amd::Coord3D origin(0, 0, 0); // If backing store was pinned then make a transfer if (flags_ & PinnedMemoryAlloced) { if (cal()->buffer_) { amd::Coord3D region(owner()->getSize()); result = dev().xferMgr().copyBuffer(*this, *pinnedMemory_, origin, origin, region, Entire); } else { amd::Image& image = *owner()->asImage(); result = dev().xferMgr().copyImageToBuffer(*this, *pinnedMemory_, origin, origin, image.getRegion(), Entire, image.getRowPitch(), image.getSlicePitch()); } } // Just do a basic host read if (!result) { if (cal()->buffer_) { amd::Coord3D region(owner()->getSize()); result = dev().xferMgr().readBuffer(*this, owner()->getHostMem(), origin, region, Entire); } else { amd::Image& image = *owner()->asImage(); result = dev().xferMgr().readImage(*this, owner()->getHostMem(), origin, image.getRegion(), image.getRowPitch(), image.getSlicePitch(), Entire); } } // Should never fail assert(result && "Memory synchronization failed!"); } } gpu::Memory* Memory::createBufferView(amd::Memory& subBufferOwner) { gpu::Memory* viewMemory; Resource::ViewParams params; HeapBlock* hb = NULL; size_t offset = subBufferOwner.getOrigin(); size_t size = subBufferOwner.getSize(); if (!dev().heap()->isVirtual()) { if (NULL == hb_) { LogError("HeapBlock must be initialized!"); return NULL; } hb = new HeapBlock(NULL, size, offset + hb_->offset()); if (hb == NULL) { LogError("We don't have enough video memory!"); return NULL; } amd::ScopedLock lock(owner()->lockMemoryOps()); hb_->addView(hb); } // Create a memory object viewMemory = new gpu::Memory(dev(), subBufferOwner, hb, size); if (NULL == viewMemory) { if (hb != NULL) { hb->setMemory(NULL); hb->free(); } return NULL; } params.owner_ = &subBufferOwner; params.gpu_ = static_cast(subBufferOwner.getVirtualDevice()); params.offset_ = offset; params.size_ = size; params.resource_ = this; params.memory_ = this; if (!viewMemory->create(Resource::View, ¶ms)) { delete viewMemory; return NULL; } // Explicitly set the host memory location, // because the parent location could change after reallocation if (NULL != owner()->getHostMem()) { subBufferOwner.setHostMem( reinterpret_cast(owner()->getHostMem()) + offset); } else { subBufferOwner.setHostMem(NULL); } return viewMemory; } void Memory::decIndMapCount() { // Map/unmap must be serialized amd::ScopedLock lock(owner()->lockMemoryOps()); if (indirectMapCount_ == 0) { LogError("decIndMapCount() called when indirectMapCount_ already zero"); return; } // Decrement the counter and release indirect map if it's the last op if (--indirectMapCount_ == 0) { if (NULL != mapMemory_) { amd::Memory* memory = mapMemory_; amd::Memory* empty = NULL; // Get GPU memory Memory* gpuMemory = mapMemory(); gpuMemory->unmap(NULL); if (!dev().addMapTarget(memory)) { memory->release(); } // Map/unamp is serialized for the same memory object, // so it's safe to clear the pointer assert((mapMemory_ != NULL) && "Mapped buffer should be valid"); mapMemory_ = NULL; } } } // Note - must be called by the device under the async lock, so no spinning // or long pauses allowed in this function. void* Memory::allocMapTarget( const amd::Coord3D& origin, const amd::Coord3D& region, uint mapFlags, size_t* rowPitch, size_t* slicePitch) { // Sanity checks assert(owner() != NULL); // Map/unmap must be serialized amd::ScopedLock lock(owner()->lockMemoryOps()); address mapAddress = NULL; size_t offset = origin[0]; incIndMapCount(); // If host memory exists, use it if ((owner()->getHostMem() != NULL) && isDirectMap()) { mapAddress = reinterpret_cast
(owner()->getHostMem()); } // If resource is a persistent allocation, we can use it directly else if (isPersistentDirectMap()) { if (NULL == map(NULL)) { LogError("Could not map target persistent resource"); decIndMapCount(); return NULL; } mapAddress = data(); } // Otherwise we can use a remote resource: else { // Are we in range? size_t elementCount = cal()->width_; size_t rSize = elementCount * elementSize(); if (offset >= rSize || offset + region[0] > rSize) { LogWarning("Memory::allocMapTarget() - offset/size out of bounds"); return NULL; } // Allocate a map resource if there isn't any yet if (indirectMapCount_ == 1) { const static bool SysMem = true; bool failed = false; amd::Memory* memory = NULL; // Search for a possible indirect resource //For SVM implementation, we cannot use cached map. if svm space, use the svm host pointer void *initHostPtr = owner()->getSvmPtr(); cl_mem_flags flag = 0; bool canBeCached = true; if (NULL != initHostPtr) { //make sure the host memory is committed already, or we have a big problem. assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!"); flag = CL_MEM_USE_HOST_PTR; canBeCached = false; } else { memory = dev().findMapTarget(owner()->getSize()); } if (memory == NULL) { // for map target of svm buffer , we need use svm host ptr memory = new(owner()->getContext()) amd::Buffer(owner()->getContext(), flag, owner()->getSize()); Memory* gpuMemory; do { if ((memory == NULL) || !memory->create(initHostPtr, SysMem)) { failed = true; break; } memory->setCacheStatus(canBeCached); gpuMemory = reinterpret_cast (memory->getDeviceMemory(dev())); // Create, Map and get the base pointer for the resource if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) { failed = true; break; } } while (false); } if (failed) { if (memory != NULL) { memory->release(); } decIndMapCount(); LogError("Could not map target resource"); return NULL; } // Map/unamp is serialized for the same memory object, // so it's safe to assign the new pointer assert((mapMemory_ == NULL) && "Mapped buffer can't be valid"); mapMemory_ = memory; } else { // Did the map resource allocation fail? if (mapMemory_ == NULL) { LogError("Could not map target resource"); return NULL; } } mapAddress = mapMemory()->data(); // Use start of the indirect buffer offset = 0; } return mapAddress + offset; } bool Memory::pinSystemMemory(void* hostPtr, size_t size) { bool result = false; // If memory has a direct access already, then skip the host memory pinning if (isHostMemDirectAccess()) { return true; } // Destroy the old pinned memory if it was already allocated if (flags_ & PinnedMemoryAlloced) { delete pinnedMemory_; flags_ &= ~PinnedMemoryAlloced; } // Allocate memory for the pinned object pinnedMemory_ = new Memory(dev(), size); if (pinnedMemory_ == NULL) { return false; } // Check if it's a view if (flags_ & SubMemoryObject) { const gpu::Memory* gpuMemory; if (owner() != NULL) { gpuMemory = dev().getGpuMemory(owner()->parent()); } else { gpuMemory = parent(); } if (gpuMemory->flags_ & PinnedMemoryAlloced) { Resource::ViewParams params; params.owner_ = owner(); params.offset_ = owner()->getOrigin(); params.size_ = owner()->getSize(); params.resource_ = gpuMemory->pinnedMemory_; params.memory_ = NULL; result = pinnedMemory_->create(Resource::View, ¶ms); } } else { Resource::PinnedParams params; // Fill resource creation parameters params.owner_ = owner(); params.hostMemRef_ = owner()->getHostMemRef(); params.size_ = size; // Create resource result = pinnedMemory_->create(Resource::Pinned, ¶ms); } if (!result) { delete pinnedMemory_; pinnedMemory_ = NULL; return false; } flags_ |= PinnedMemoryAlloced; return true; } void* Memory::cpuMap( device::VirtualDevice& vDev, uint flags, uint startLayer, uint numLayers, size_t* rowPitch, size_t* slicePitch) { uint resFlags = 0; if (flags == Memory::CpuReadOnly) { resFlags = Resource::ReadOnly; } else if (flags == Memory::CpuWriteOnly) { resFlags = Resource::WriteOnly; } void* ptr = map(&static_cast(vDev), resFlags, startLayer, numLayers); if (!cal()->buffer_) { *rowPitch = cal()->pitch_ * elementSize(); *slicePitch = cal()->slice_ * elementSize(); } return ptr; } void Memory::cpuUnmap(device::VirtualDevice& vDev) { unmap(&static_cast(vDev)); } //! \note moveTo() must be called only from outside of //! VirtualGPU submit command methods. //! Otherwise a deadlock in lockVgpus() is possible. //! Also the logic in this function is very specific to //! the zero-copy functionality. bool Memory::moveTo(Memory& dst) { bool result = false; // Make sure that all virtual devices don't process any commands Device::ScopedLockVgpus lock(dev()); // Wait for idle on all virtual GPUs //!@note It's enough to wait on the active queue only for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { wait(*(dev().vgpus()[idx])); } static const bool Entire = true; amd::Coord3D origin(0, 0, 0); amd::Coord3D region(size()); // Transfer the data from old location to a new one if (dev().xferMgr().copyBuffer( *this, dst, origin, origin, region, Entire)) { // Move all properties to the new object dst.mapMemory_ = mapMemory_; mapMemory_ = NULL; dst.flags_ |= flags_ & ~HostMemoryDirectAccess; flags_ &= HostMemoryDirectAccess; dst.indirectMapCount_ = indirectMapCount_; indirectMapCount_ = 0; dst.pinnedMemory_ = pinnedMemory_; pinnedMemory_ = NULL; // Replace the device memory object //! @note: current object will be destroyed owner()->replaceDeviceMemory(&dev(), &dst); result = true; } return result; } Memory* Memory::mapMemory() const { Memory* map = NULL; if (NULL != mapMemory_) { map = reinterpret_cast(mapMemory_->getDeviceMemory(dev())); } return map; } void Memory::mgpuCacheWriteBack() { // Lock memory object, so only one write back can occur amd::ScopedLock lock(owner()->lockMemoryOps()); // Attempt to allocate a staging buffer if don't have any if (owner()->getHostMem() == NULL) { static const bool forceAllocHostMem = true; if (owner()->allocHostMemory(NULL, forceAllocHostMem)) { //! \note Ignore pinning result bool ok = pinSystemMemory( owner()->getHostMem(), owner()->getHostMemRef()->size()); } } // Make synchronization if (owner()->getHostMem() != NULL) { owner()->cacheWriteBack(); } } Memory* Buffer::createBufferView(amd::Memory& subBufferOwner) const { gpu::Memory* subBuffer; Resource::ViewParams params; size_t offset = subBufferOwner.getOrigin(); size_t size = subBufferOwner.getSize(); // Create a memory object subBuffer = new gpu::Buffer(dev(), subBufferOwner, size); if (NULL == subBuffer) { return NULL; } // Allocate a view for this buffer object params.owner_ = &subBufferOwner; params.offset_ = offset; params.size_ = size; params.resource_ = this; params.memory_ = this; if (!subBuffer->create(Resource::View, ¶ms)) { delete subBuffer; return NULL; } return subBuffer; } void* Image::allocMapTarget( const amd::Coord3D& origin, const amd::Coord3D& region, uint mapFlags, size_t* rowPitch, size_t* slicePitch) { // Sanity checks assert(owner() != NULL); bool useRemoteResource = true; size_t slicePitchTmp = 0; size_t height = cal()->height_; size_t depth = cal()->depth_; // Map/unmap must be serialized amd::ScopedLock lock(owner()->lockMemoryOps()); address mapAddress = NULL; size_t offset = origin[0]; incIndMapCount(); // If host memory exists, use it if ((owner()->getHostMem() != NULL) && isDirectMap()) { useRemoteResource = false; mapAddress = reinterpret_cast
(owner()->getHostMem()); amd::Image* amdImage = owner()->asImage(); // Calculate the offset in bytes offset *= elementSize(); // Update the row and slice pitches value *rowPitch = (amdImage->getRowPitch() == 0) ? (cal()->width_ * elementSize()) : amdImage->getRowPitch(); slicePitchTmp = (amdImage->getSlicePitch() == 0) ? (height * (*rowPitch)) : amdImage->getSlicePitch(); // Adjust the offset in Y and Z dimensions offset += origin[1] * (*rowPitch); offset += origin[2] * slicePitchTmp; } // If resource is a persistent allocation, we can use it directly //! @note Even if resource is a persistent allocation, //! runtime can't use it directly, //! because CAL volume map doesn't work properly. //! @todo arrays can be added for persistent lock with some CAL changes else if (isPersistentDirectMap()) { if (NULL == map(NULL)) { useRemoteResource = true; LogError("Could not map target persistent resource, try remote resource"); } else { useRemoteResource = false; mapAddress = data(); // Calculate the offset in bytes offset *= elementSize(); // Update the row pitch value *rowPitch = cal()->pitch_ * elementSize(); // Adjust the offset in Y dimension offset += origin[1] * (*rowPitch); } } // Otherwise we can use a remote resource: if (useRemoteResource) { // Calculate X offset in bytes offset *= elementSize(); // Allocate a map resource if there isn't any yet if (indirectMapCount_ == 1) { const static bool SysMem = true; bool failed = false; amd::Memory* memory; // Search for a possible indirect resource memory = dev().findMapTarget(owner()->getSize()); if (memory == NULL) { // Allocate a new buffer to use as the map target //! @note Allocate a 1D buffer, since CAL issues with 3D //! Also HW doesn't support untiled images memory = new (owner()->getContext()) amd::Buffer(owner()->getContext(), 0, cal()->width_ * height * depth * elementSize()); Memory* gpuMemory; do { if ((memory == NULL) || !memory->create(NULL, SysMem)) { failed = true; break; } gpuMemory = reinterpret_cast (memory->getDeviceMemory(dev())); // Create, Map and get the base pointer for the resource if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) { failed = true; break; } } while (false); } if (failed) { if (memory != NULL) { memory->release(); } decIndMapCount(); LogError("Could not map target resource"); return NULL; } // Map/unamp is serialized for the same memory object, // so it's safe to assign the new pointer assert((mapMemory_ == NULL) && "Mapped buffer can't be valid"); mapMemory_ = memory; } else { // Did the map resource allocation fail? if (mapMemory_ == NULL) { LogError("Could not map target resource"); return NULL; } } mapAddress = mapMemory()->data(); // Update the row and slice pitches value *rowPitch = region[0] * elementSize(); if (cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) { slicePitchTmp = *rowPitch ; } else { slicePitchTmp = *rowPitch * region[1]; } // Use start of the indirect buffer offset = 0; } if (slicePitch != NULL) { *slicePitch = slicePitchTmp; } return mapAddress + offset; } } // namespace gpu