diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp index bcbb98ae87..db57bc622c 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp @@ -953,7 +953,7 @@ KernelBlitManager::copyBufferToImage( size_t imgSlicePitch = imgRowPitch * size[1]; if (setup_.disableCopyBufferToImage_) { - result = DmaBlitManager::copyBufferToImage( + result = HostBlitManager::copyBufferToImage( srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); synchronize(); @@ -1061,7 +1061,7 @@ KernelBlitManager::copyBufferToImageKernel( // todo ROC runtime has a problem with a view for this format (gpuMem(dstMemory).owner()->asImage()-> getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) { - dstView = createView(gpuMem(dstMemory), newFormat); + dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY); if (dstView != NULL) { rejected = false; releaseView = true; @@ -1189,7 +1189,7 @@ KernelBlitManager::copyImageToBuffer( size_t imgSlicePitch = imgRowPitch * size[1]; if (setup_.disableCopyImageToBuffer_) { - result = HostBlitManager::copyImageToBuffer( + result = DmaBlitManager::copyImageToBuffer( srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); synchronize(); @@ -1265,7 +1265,7 @@ KernelBlitManager::copyImageToBufferKernel( // todo ROC runtime has a problem with a view for this format (gpuMem(srcMemory).owner()->asImage()-> getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) { - srcView = createView(gpuMem(srcMemory), newFormat); + srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY); if (srcView != NULL) { rejected = false; releaseView = true; @@ -1417,9 +1417,9 @@ KernelBlitManager::copyImage( // Attempt to create a view if the format was rejected if (rejected) { - srcView = createView(gpuMem(srcMemory), newFormat); + srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY); if (srcView != NULL) { - dstView = createView(gpuMem(dstMemory), newFormat); + dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY); if (dstView != NULL) { rejected = false; releaseView = true; @@ -1433,7 +1433,7 @@ KernelBlitManager::copyImage( // Fall into the host path for the entire 2D copy or // if the image format was rejected if (rejected) { - result = HostBlitManager::copyImage(srcMemory, dstMemory, + result = DmaBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); synchronize(); return result; @@ -1584,7 +1584,7 @@ KernelBlitManager::readImage( if (amdMemory == NULL) { // Force SW copy - result = HostBlitManager::readImage(srcMemory, dstHost, + result = DmaBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); synchronize(); return result; @@ -1638,7 +1638,7 @@ KernelBlitManager::writeImage( if (amdMemory == NULL) { // Force SW copy - result = HostBlitManager::writeImage( + result = DmaBlitManager::writeImage( srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); synchronize(); return result; @@ -1679,7 +1679,7 @@ KernelBlitManager::copyBufferRect( // Fall into the ROC path for rejected transfers if (setup_.disableCopyBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) { - result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory, + result = HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); if (result) { @@ -1819,7 +1819,7 @@ KernelBlitManager::readBuffer( if (amdMemory == NULL) { // Force SW copy - result = HostBlitManager::readBuffer( + result = DmaBlitManager::readBuffer( srcMemory, dstHost, origin, size, entire); synchronize(); return result; @@ -1875,7 +1875,7 @@ KernelBlitManager::readBufferRect( if (amdMemory == NULL) { // Force SW copy - result = HostBlitManager::readBufferRect( + result = DmaBlitManager::readBufferRect( srcMemory, dstHost, bufRect, hostRect, size, entire); synchronize(); return result; @@ -1933,7 +1933,7 @@ KernelBlitManager::writeBuffer( if (amdMemory == NULL) { // Force SW copy - result = HostBlitManager::writeBuffer( + result = DmaBlitManager::writeBuffer( srcHost, dstMemory, origin, size, entire); synchronize(); return result; @@ -2264,7 +2264,7 @@ KernelBlitManager::fillImage( } // If the image format was rejected, then attempt to create a view if (rejected) { - memView = createView(gpuMem(memory), newFormat); + memView = createView(gpuMem(memory), newFormat, CL_MEM_WRITE_ONLY); if (memView != NULL) { rejected = false; releaseView = true; @@ -2419,11 +2419,12 @@ DmaBlitManager::pinHostMemory( Memory* KernelBlitManager::createView( const Memory& parent, - const cl_image_format format) const + cl_image_format format, + cl_mem_flags flags) const { assert((parent.owner()->asBuffer() == nullptr) && "View supports images only"); - amd::Image *image = - parent.owner()->asImage()->createView(parent.owner()->getContext(), format, &gpu()); + amd::Image *image = parent.owner()->asImage()->createView( + parent.owner()->getContext(), format, &gpu(), 0, flags); if (image == NULL) { LogError("[OCL] Fail to allocate view of image object"); diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp index 8891f7170c..7a22a42cb9 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp @@ -439,8 +439,9 @@ private: //! Creates a view memory object Memory* createView( - const Memory& parent, //!< Parent memory object - const cl_image_format format //!< The new format for a view + const Memory& parent, //!< Parent memory object + cl_image_format format, //!< The new format for a view + cl_mem_flags flags //!< Memory flags ) const; //! Disable copy constructor diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp index 403c65cd6f..b9323c1a1e 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp @@ -1382,9 +1382,12 @@ Device::createMemory(amd::Memory &owner) const return NULL; } + // Transfer data only if OCL context has one device. + // Cache coherency layer will update data for multiple devices if (!memory->isHostMemDirectAccess() && owner.asImage() && - owner.parent() == NULL && - (owner.getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR))) { + (owner.parent() == nullptr) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { // To avoid recurssive call to Device::createMemory, we perform // data transfer to the view of the image. amd::Image* imageView = owner.asImage()->createView( @@ -1417,15 +1420,18 @@ Device::createMemory(amd::Memory &owner) const amd::Coord3D(0, 0, 0), imageView->getRegion(), 0, 0, true); - // Release host memory for single device, since runtime copied data - if ((owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - owner.setHostMem(nullptr); - } + + // Release host memory, since runtime copied data + owner.setHostMem(nullptr); imageView->release(); } + // Prepin sysmem buffer for possible data synchronization between CPU and GPU + if (!memory->isHostMemDirectAccess() && (owner.getHostMem() != nullptr)) { + memory->pinSystemMemory(owner.getHostMem(), owner.getSize()); + } + if (!result) { delete memory; return NULL; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp index ed12234cd9..68c14069a3 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp @@ -411,6 +411,8 @@ public: amd::Memory* mem //!< Pointer to AMD memory object ) const; + amd::Context& context() const { return *context_; } + private: static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; @@ -431,8 +433,8 @@ private: size_t gpuvm_segment_max_alloc_; size_t alloc_granularity_; static const bool offlineDevice_; - amd::Context *context_; //!< A dummy context for internal data transfer - VirtualGPU *xferQueue_; //!< Transfer queue, created on demand + amd::Context* context_; //!< A dummy context for internal data transfer + VirtualGPU* xferQueue_; //!< Transfer queue, created on demand VirtualGPU* xferQueue() const; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp index 26940ccc08..3d7e9986a8 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp @@ -29,6 +29,7 @@ Memory::Memory(const roc::Device &dev, amd::Memory &owner) , dev_(dev) , deviceMemory_(NULL) , kind_(MEMORY_KIND_NORMAL) + , pinnedMemory_(nullptr) { } @@ -37,12 +38,18 @@ Memory::Memory(const roc::Device &dev, size_t size) , dev_(dev) , deviceMemory_(NULL) , kind_(MEMORY_KIND_NORMAL) + , pinnedMemory_(nullptr) { } Memory::~Memory() { - dev_.removeVACache(this); + // Destory pinned memory + if (flags_ & PinnedMemoryAlloced) { + pinnedMemory_->release(); + } + + dev().removeVACache(this); if (nullptr != mapMemory_) { mapMemory_->release(); } @@ -55,13 +62,11 @@ Memory::allocateMapMemory(size_t allocationSize) void *mapData = NULL; - amd::Memory* mapMemory = dev_.findMapTarget(owner()->getSize()); - + amd::Memory* mapMemory = dev().findMapTarget(owner()->getSize()); if (mapMemory == nullptr) { // Create buffer object to contain the map target. - mapMemory = - new(owner()->getContext()) amd::Buffer( - owner()->getContext(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize()); + mapMemory = new (dev().context()) amd::Buffer( + dev().context(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize()); if ((mapMemory == NULL) || (!mapMemory->create())) { LogError("[OCL] Fail to allocate map target object"); @@ -96,7 +101,6 @@ Memory::allocMapTarget( amd::ScopedLock lock(owner()->lockMemoryOps()); incIndMapCount(); - // If the device backing storage is direct accessible, use it. if (isHostMemDirectAccess()) { if (owner()->getHostMem() != nullptr) { @@ -126,7 +130,6 @@ Memory::allocMapTarget( return NULL; } } - return reinterpret_cast
(mapMemory_->getHostMem()) + origin[0]; } @@ -144,7 +147,7 @@ Memory::decIndMapCount() // Decrement the counter and release indirect map if it's the last op if (--indirectMapCount_ == 0 && mapMemory_ != NULL) { - if (!dev_.addMapTarget(mapMemory_)) { + if (!dev().addMapTarget(mapMemory_)) { // Release the buffer object containing the map data. mapMemory_->release(); } @@ -219,11 +222,11 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada in.out_driver_data_size=0; in.out_driver_data=NULL; - if(!dev_.mesa().Export(in, out)) + if(!dev().mesa().Export(in, out)) return false; size_t size; - hsa_agent_t agent=dev_.getBackendDevice(); + hsa_agent_t agent=dev().getBackendDevice(); hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata); close(out.dmabuf_fd); @@ -244,6 +247,344 @@ void Memory::destroyInteropBuffer() deviceMemory_=NULL; } +bool +Memory::pinSystemMemory(void* hostPtr, size_t size) +{ + size_t pinAllocSize; + const static bool SysMem = true; + amd::Memory* amdMemory = nullptr; + amd::Memory* amdParent = owner()->parent(); + + // If memory has a direct access already, then skip the host memory pinning + if (isHostMemDirectAccess()) { + return true; + } + + // Memory was pinned already + if (flags_ & PinnedMemoryAlloced) { + return true; + } + + // Check if runtime allocates a parent object + if (amdParent != nullptr) { + Memory* parent = dev().getRocMemory(amdParent); + amd::Memory* amdPinned = parent->pinnedMemory_; + if (amdPinned != nullptr) { + // Create view on the parent's pinned memory + amdMemory = new (amdPinned->getContext()) amd::Buffer( + *amdPinned, 0, owner()->getOrigin(), owner()->getSize()); + if ((amdMemory != nullptr) && !amdMemory->create()) { + amdMemory->release(); + amdMemory = nullptr; + } + } + } + + if (amdMemory == nullptr) { + amdMemory = new (dev().context()) + amd::Buffer(dev().context(), CL_MEM_USE_HOST_PTR, size); + if ((amdMemory != nullptr) && !amdMemory->create(hostPtr, SysMem)) { + amdMemory->release(); + return false; + } + } + + // Get device memory for this virtual device + // @note: This will force real memory pinning + Memory* srcMemory = dev().getRocMemory(amdMemory); + + if (srcMemory == nullptr) { + // Release memory + amdMemory->release(); + return false; + } + else { + pinnedMemory_ = amdMemory; + flags_ |= PinnedMemoryAlloced; + } + + return true; +} + +void +Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) +{ + // If the last writer was another GPU, then make a writeback + if (!isHostMemDirectAccess() && + (owner()->getLastWriter() != nullptr) && + (&dev() != owner()->getLastWriter())) { + mgpuCacheWriteBack(); + } + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) { + bool hasUpdates = true; + amd::Memory* amdParent = owner()->parent(); + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (amdParent != nullptr)) { + Memory* gpuMemory = dev().getRocMemory(amdParent); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP + } + + // Is this a NOP? + if ((version_ == owner()->getVersion()) || + (&dev() == owner()->getLastWriter())) { + hasUpdates = false; + } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && + (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } + else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = + sub->getDeviceMemory(dev(), AllocSubBuffer); + if (nullptr != devSub) { + Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because this GPU device was the last writer + if (&dev() != owner()->getLastWriter()) { + // Update the latest version + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If host memory was pinned then make a transfer + if (flags_ & PinnedMemoryAlloced) { + Memory& pinned = *dev().getRocMemory(pinnedMemory_); + if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().copyBuffer(pinned, + *this, origin, origin, region, Entire); + } + else { + amd::Image& image = static_cast(*owner()); + result = gpu.blitMgr().copyBufferToImage(pinned, + *this, origin, origin, image.getRegion(), Entire, + image.getRowPitch(), image.getSlicePitch()); + } + } + + if (!result) { + if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), + *this, origin, region, Entire); + } + else { + amd::Image& image = static_cast(*owner()); + result = gpu.blitMgr().writeImage(owner()->getHostMem(), + *this, origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } + } + + //!@todo A wait isn't really necessary. However processMemObjects() + // may lose the track of dependencies with a compute transfer(if sdma failed). + wait(gpu); + + // Should never fail + assert(result && "Memory synchronization failed!"); + } +} + +void +Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) +{ + // Sanity checks + assert(owner() != nullptr); + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess()) { + bool hasUpdates = true; + amd::Memory* amdParent = owner()->parent(); + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (amdParent != nullptr)) { + device::Memory* m = dev().getRocMemory(amdParent); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + m->syncHostFromCache(syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP + } + + // Is this a NOP? + if ((nullptr == owner()->getLastWriter()) || + (version_ == owner()->getVersion())) { + hasUpdates = false; + } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && + (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } + else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = + sub->getDeviceMemory(dev(), AllocSubBuffer); + if (nullptr != devSub) { + Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncHostFromCache(syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because CPU was the last writer + if (nullptr != owner()->getLastWriter()) { + // Mark parent as up to date, set our version accordingly + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If backing store was pinned then make a transfer + if (flags_ & PinnedMemoryAlloced) { + Memory& pinned = *dev().getRocMemory(pinnedMemory_); + if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().copyBuffer(*this, + pinned, origin, origin, region, Entire); + } + else { + amd::Image& image = static_cast(*owner()); + result = dev().xferMgr().copyImageToBuffer(*this, + pinned, origin, origin, image.getRegion(), Entire, + image.getRowPitch(), image.getSlicePitch()); + } + } + + // Just do a basic host read + if (!result) { + if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().readBuffer(*this, + owner()->getHostMem(), origin, region, Entire); + } + else { + amd::Image& image = static_cast(*owner()); + result = dev().xferMgr().readImage(*this, + owner()->getHostMem(), origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } + } + + // Should never fail + assert(result && "Memory synchronization failed!"); + } +} + +void +Memory::mgpuCacheWriteBack() +{ + // Lock memory object, so only one write back can occur + amd::ScopedLock lock(owner()->lockMemoryOps()); + + // Attempt to allocate a staging buffer if don't have any + if (owner()->getHostMem() == nullptr) { + if (nullptr != owner()->getSvmPtr()) { + owner()->commitSvmMemory(); + owner()->setHostMem(owner()->getSvmPtr()); + } + else { + static const bool forceAllocHostMem = true; + owner()->allocHostMemory(nullptr, forceAllocHostMem); + } + } + + // Make synchronization + if (owner()->getHostMem() != nullptr) { + //! \note Ignore pinning result + bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); + owner()->cacheWriteBack(); + } +} + /////////////////////////////////roc::Buffer////////////////////////////// Buffer::Buffer(const roc::Device &dev, amd::Memory &owner) @@ -257,7 +598,7 @@ Buffer::Buffer(const roc::Device &dev, size_t size) Buffer::~Buffer() { if (owner() == nullptr) { - dev_.hostFree(deviceMemory_, size()); + dev().hostFree(deviceMemory_, size()); } else { destroy(); @@ -285,18 +626,18 @@ Buffer::destroy() // deallocated later on => avoid double deallocation if (isHostMemDirectAccess()) { if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { - if (dev_.agent_profile() != HSA_PROFILE_FULL) { + if (dev().agent_profile() != HSA_PROFILE_FULL) { hsa_amd_memory_unlock(owner()->getHostMem()); } } } else { - dev_.memFree(deviceMemory_, size()); + dev().memFree(deviceMemory_, size()); } } if (memFlags & CL_MEM_USE_HOST_PTR) { - if (dev_.agent_profile() == HSA_PROFILE_FULL) { + if (dev().agent_profile() == HSA_PROFILE_FULL) { hsa_memory_deregister(owner()->getHostMem(), size()); } } @@ -306,7 +647,7 @@ bool Buffer::create() { if (owner() == nullptr) { - deviceMemory_ = dev_.hostAlloc(size(), 1, false); + deviceMemory_ = dev().hostAlloc(size(), 1, false); if (deviceMemory_ != nullptr) { flags_ |= HostMemoryDirectAccess; return true; @@ -332,7 +673,6 @@ Buffer::create() const size_t offset = owner()->getOrigin(); deviceMemory_ = parentBuffer->getDeviceMemory() + offset; - flags_ |= SubMemoryObject; flags_ |= parentBuffer->isHostMemDirectAccess() ? HostMemoryDirectAccess : 0; @@ -352,32 +692,35 @@ Buffer::create() // Allocate backing storage in device local memory unless UHP or AHP are set const cl_mem_flags memFlags = owner()->getMemFlags(); if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) { - deviceMemory_ = dev_.deviceLocalAlloc(size()); + deviceMemory_ = dev().deviceLocalAlloc(size()); if (deviceMemory_ == NULL) { // TODO: device memory is not enabled yet. // Fallback to system memory if exist. - flags_ |= HostMemoryDirectAccess; - if (dev_.agent_profile() == HSA_PROFILE_FULL && + if (dev().agent_profile() == HSA_PROFILE_FULL && owner()->getHostMem() != NULL) { deviceMemory_ = owner()->getHostMem(); assert( amd::isMultipleOf( deviceMemory_, - static_cast(dev_.info().memBaseAddrAlign_))); + static_cast(dev().info().memBaseAddrAlign_))); return true; } - deviceMemory_ = dev_.hostAlloc(size(), 1, false); + deviceMemory_ = dev().hostAlloc(size(), 1, false); + owner()->setHostMem(deviceMemory_); } assert( amd::isMultipleOf( deviceMemory_, - static_cast(dev_.info().memBaseAddrAlign_))); + static_cast(dev().info().memBaseAddrAlign_))); - if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) { + // Transfer data only if OCL context has one device. + // Cache coherency layer will update data for multiple devices + if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR) && + (owner()->getContext().devices().size() == 1) ) { // To avoid recurssive call to Device::createMemory, we perform // data transfer to the view of the buffer. amd::Buffer *bufferView = new (owner()->getContext()) amd::Buffer( @@ -390,16 +733,12 @@ Buffer::create() bufferView->replaceDeviceMemory(&dev_, devBufferView); - bool ret = dev_.xferMgr().writeBuffer( + bool ret = dev().xferMgr().writeBuffer( owner()->getHostMem(), *devBufferView, amd::Coord3D(0), amd::Coord3D(size()), true); - // Release host memory for single device, - // since runtime copied data - if (owner()->getContext().devices().size() == 1) { - owner()->setHostMem(nullptr); - } - + // Release host memory, since runtime copied data + owner()->setHostMem(nullptr); bufferView->release(); return ret; } @@ -410,7 +749,7 @@ Buffer::create() flags_ |= HostMemoryDirectAccess; - if (dev_.agent_profile() == HSA_PROFILE_FULL) { + if (dev().agent_profile() == HSA_PROFILE_FULL) { deviceMemory_ = owner()->getHostMem(); if (memFlags & CL_MEM_USE_HOST_PTR) { @@ -422,9 +761,8 @@ Buffer::create() if (owner()->getSvmPtr() != owner()->getHostMem()) { if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { - hsa_agent_t agent = dev_.getBackendDevice(); hsa_status_t status = hsa_amd_memory_lock( - owner()->getHostMem(), owner()->getSize(), &agent, 1, &deviceMemory_); + owner()->getHostMem(), owner()->getSize(), nullptr, 0, &deviceMemory_); if (status != HSA_STATUS_SUCCESS) { deviceMemory_ = nullptr; } @@ -622,7 +960,7 @@ Image::createInteropImage() originalDeviceMemory_=deviceMemory_; - hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_); + hsa_status_t err=hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_); if(err!=HSA_STATUS_SUCCESS) return false; @@ -654,7 +992,7 @@ Image::create() // Get memory size requirement for device specific image. hsa_status_t status = hsa_ext_image_data_get_info( - dev_.getBackendDevice(), &imageDescriptor_, + dev().getBackendDevice(), &imageDescriptor_, permission_, &deviceImageInfo_); if (status != HSA_STATUS_SUCCESS) { @@ -666,16 +1004,16 @@ Image::create() // support alignment larger than HSA memory region allocation granularity. // In this case, the user manages the alignment. const size_t alloc_size = - (deviceImageInfo_.alignment <= dev_.alloc_granularity()) + (deviceImageInfo_.alignment <= dev().alloc_granularity()) ? deviceImageInfo_.size : deviceImageInfo_.size + deviceImageInfo_.alignment; if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { - originalDeviceMemory_ = dev_.deviceLocalAlloc(alloc_size); + originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size); } if (originalDeviceMemory_ == NULL) { - originalDeviceMemory_ = dev_.hostAlloc(alloc_size, 1, false); + originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false); } deviceMemory_ = reinterpret_cast( @@ -686,7 +1024,7 @@ Image::create() deviceMemory_, static_cast(deviceImageInfo_.alignment))); status = hsa_ext_image_create( - dev_.getBackendDevice(), &imageDescriptor_, deviceMemory_, + dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, permission_, &hsaImageObject_); if (status != HSA_STATUS_SUCCESS) { @@ -712,10 +1050,11 @@ Image::createView(const Memory &parent) } kind_ = parent.getKind(); + version_ = parent.version(); hsa_status_t status; if (kind_ == MEMORY_KIND_INTEROP) { - status = hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, + status = hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_, amdImageDesc_, deviceMemory_, permission_, &hsaImageObject_); } else if (oldestParent->asBuffer()) { @@ -732,15 +1071,15 @@ Image::createView(const Memory &parent) // Make sure the row pitch is aligned to pixels rowPitch = elementSize * - amd::alignUp(rowPitch, dev_.info().imagePitchAlignment_); + amd::alignUp(rowPitch, dev().info().imagePitchAlignment_); - status = hsa_ext_image_create_with_layout(dev_.getBackendDevice(), + status = hsa_ext_image_create_with_layout(dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, permission_, HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, &hsaImageObject_); } else { - status= hsa_ext_image_create(dev_.getBackendDevice(), &imageDescriptor_, + status= hsa_ext_image_create(dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, permission_, &hsaImageObject_); } @@ -830,7 +1169,7 @@ Image::destroy() { if (hsaImageObject_.handle != 0) { hsa_status_t status = - hsa_ext_image_destroy(dev_.getBackendDevice(), hsaImageObject_); + hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_); assert(status == HSA_STATUS_SUCCESS); } @@ -847,7 +1186,7 @@ Image::destroy() } if (originalDeviceMemory_ != NULL) { - dev_.memFree(originalDeviceMemory_, deviceImageInfo_.size); + dev().memFree(originalDeviceMemory_, deviceImageInfo_.size); } } } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp index 92e945cb13..9b60aaec56 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp @@ -39,17 +39,21 @@ class Memory : public device::Memory { // Pins system memory associated with this memory object. virtual bool pinSystemMemory(void *hostPtr, // System memory address size_t size // Size of allocated system memory - ) { - Unimplemented(); - return true; - } + ); + + //! Updates device memory from the owner's host allocation + void syncCacheFromHost( + VirtualGPU& gpu, //!< Virtual GPU device object + //! Synchronization flags + device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags() + ); // Immediate blocking write from device cache to owners's backing store. // Marks owner as "current" by resetting the last writer to NULL. - virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()) - { - // Need to revisit this when multi-devices is supported. - } + virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()); + + //! Allocates host memory for synchronization with MGPU context + void mgpuCacheWriteBack(); // Releases indirect map surface void releaseIndirectMap() { decIndMapCount(); } @@ -78,6 +82,10 @@ class Memory : public device::Memory { MEMORY_KIND getKind() const { return kind_; } + const roc::Device& dev() const { return dev_; } + + size_t version() const { return version_; } + protected: bool allocateMapMemory(size_t allocationSize); @@ -102,13 +110,14 @@ class Memory : public device::Memory { // Track if this memory is interop, lock, gart, or normal. MEMORY_KIND kind_; - private: +private: // Disable copy constructor Memory(const Memory &); // Disable operator= Memory &operator=(const Memory &); + amd::Memory* pinnedMemory_; //!< Memory used as pinned system memory }; class Buffer : public roc::Memory { diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index 33f6516d21..543bcc7872 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -261,11 +261,14 @@ VirtualGPU::processMemObjects( } } else { - Memory* gpuMemory = static_cast(memory->getDeviceMemory(dev())); - if (NULL != gpuMemory) { + Memory* rocMemory = static_cast(memory->getDeviceMemory(dev())); + if (NULL != rocMemory) { + // Synchronize data with other memory instances if necessary + rocMemory->syncCacheFromHost(*this); + const static bool IsReadOnly = false; // Validate SVM passed in the non argument list - memoryDependency().validate(*this, gpuMemory, IsReadOnly); + memoryDependency().validate(*this, rocMemory, IsReadOnly); } else { return false; @@ -305,6 +308,12 @@ VirtualGPU::processMemObjects( else { memory = static_cast(svmMem->getDeviceMemory(dev())); } + // Don't sync for internal objects, + // since they are not shared between devices + if (memory->owner()->getVirtualDevice() == nullptr) { + // Synchronize data with other memory instances if necessary + memory->syncCacheFromHost(*this); + } } if (memory != NULL) { @@ -480,6 +489,8 @@ VirtualGPU::VirtualGPU(Device &device) VirtualGPU::~VirtualGPU() { + releasePinnedMem(); + if (timestamp_ != NULL) { delete timestamp_; timestamp_ = NULL; @@ -821,7 +832,10 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd) // Find if virtual address is a CL allocation device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset); - device::Memory *devMem = cmd.source().getDeviceMemory(dev()); + Memory* devMem = dev().getRocMemory(&cmd.source()); + // Synchronize data with other memory instances if necessary + devMem->syncCacheFromHost(*this); + void *dst = cmd.destination(); amd::Coord3D size = cmd.size(); @@ -896,8 +910,14 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd) // Find if virtual address is a CL allocation device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset); - device::Memory *devMem = cmd.destination().getDeviceMemory(dev()); - const char *src = static_cast(cmd.source()); + Memory* devMem = dev().getRocMemory(&cmd.destination()); + + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = cmd.isEntireMemory(); + devMem->syncCacheFromHost(*this, syncFlags); + + const char* src = static_cast(cmd.source()); amd::Coord3D size = cmd.size(); //! @todo add multi-devices synchronization when supported. @@ -1008,11 +1028,16 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd) profilingBegin(cmd); - device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev()); - device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev()); - amd::Coord3D size = cmd.size(); + Memory* srcDevMem = dev().getRocMemory(&cmd.source()); + Memory* dstDevMem = dev().getRocMemory(&cmd.destination()); - //! @todo add multi-devices synchronization when supported. + // Synchronize source and destination memory + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = cmd.isEntireMemory(); + dstDevMem->syncCacheFromHost(*this, syncFlags); + srcDevMem->syncCacheFromHost(*this); + + amd::Coord3D size = cmd.size(); cl_command_type type = cmd.type(); bool result = false; @@ -1051,31 +1076,31 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd) } result = blitMgr().copyBuffer( - *srcDevMem, *destDevMem, srcOrigin, + *srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, cmd.isEntireMemory()); break; } case CL_COMMAND_COPY_BUFFER_RECT: { result = blitMgr().copyBufferRect( - *srcDevMem, *destDevMem, cmd.srcRect(), + *srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size, cmd.isEntireMemory()); break; } case CL_COMMAND_COPY_IMAGE: { result = blitMgr().copyImage( - *srcDevMem, *destDevMem, cmd.srcOrigin(), + *srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), size, cmd.isEntireMemory()); break; } case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { result = blitMgr().copyImageToBuffer( - *srcDevMem, *destDevMem, cmd.srcOrigin(), + *srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), size, cmd.isEntireMemory()); break; } case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { result = blitMgr().copyBufferToImage( - *srcDevMem, *destDevMem, cmd.srcOrigin(), + *srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), size, cmd.isEntireMemory()); break; } @@ -1121,7 +1146,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd) //! @todo add multi-devices synchronization when supported. - roc::Memory *devMemory = reinterpret_cast( + roc::Memory* devMemory = reinterpret_cast( cmd.memory().getDeviceMemory(dev(), false)); cl_command_type type = cmd.type(); @@ -1139,12 +1164,17 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd) mapFlag, cmd.isEntireMemory()); // Sync to the map target. - if (devMemory->isHostMemDirectAccess()) { - // Add memory to VA cache, so rutnime can detect direct access to VA - dev().addVACache(devMemory); + // If we have host memory, use it + if (devMemory->owner()->getHostMem() != nullptr) { + // Target is the backing store, so just ensure that owner is up-to-date + devMemory->owner()->cacheWriteBack(); + + if (devMemory->isHostMemDirectAccess()) { + // Add memory to VA cache, so rutnime can detect direct access to VA + dev().addVACache(devMemory); + } } - if ((!devMemory->isHostMemDirectAccess()) && - (mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) { + else if (mapFlag & (CL_MAP_READ | CL_MAP_WRITE)) { bool result = false; roc::Memory *hsaMemory = static_cast(devMemory); @@ -1176,7 +1206,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd) *hsaMemory, static_cast(hostPtr)+origin[0], origin, size, cmd.isEntireMemory()); } - } else if (type == CL_COMMAND_MAP_IMAGE) { amd::Image* image = cmd.memory().asImage(); @@ -1225,11 +1254,19 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd) // Force buffer write for IMAGE1D_BUFFER bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER); - if (devMemory->isHostMemDirectAccess()) { - // Remove memory from VA cache - dev().removeVACache(devMemory); + // We used host memory + if (devMemory->owner()->getHostMem() != nullptr) { + if (mapInfo->isUnmapWrite()) { + // Target is the backing store, so sync + devMemory->owner()->signalWrite(nullptr); + devMemory->syncCacheFromHost(*this); + } + if (devMemory->isHostMemDirectAccess()) { + // Remove memory from VA cache + dev().removeVACache(devMemory); + } } - if (mapInfo->isUnmapWrite()) { + else if (mapInfo->isUnmapWrite()) { // Commit the changes made by the user. if (!devMemory->isHostMemDirectAccess()) { bool result = false; @@ -1299,9 +1336,13 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd) profilingBegin(cmd); - device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false); + Memory* memory = dev().getRocMemory(&cmd.memory()); - //! @todo add multi-devices synchronization when supported. + bool entire = cmd.isEntireMemory(); + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + memory->syncCacheFromHost(*this, syncFlags); cl_command_type type = cmd.type(); bool result = false; @@ -1335,14 +1376,12 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd) patternSize = elemSize; } result = blitMgr().fillBuffer( - *devMemory, pattern, patternSize, origin, size, - cmd.isEntireMemory()); + *memory, pattern, patternSize, origin, size, entire); break; } case CL_COMMAND_FILL_IMAGE: { result = blitMgr().fillImage( - *devMemory, cmd.pattern(), cmd.origin(), cmd.size(), - cmd.isEntireMemory()); + *memory, cmd.pattern(), cmd.origin(), cmd.size(), entire); break; } default: @@ -1367,21 +1406,21 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd) profilingBegin(vcmd); - std::vector::const_iterator itr; - - for (itr = vcmd.memObjects().begin(); - itr != vcmd.memObjects().end(); - itr++) { + for (auto itr : vcmd.memObjects()) { // Find device memory - device::Memory *m = (*itr)->getDeviceMemory(dev()); - roc::Memory *memory = static_cast(m); + Memory* memory = dev().getRocMemory(&(*itr)); if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { - //! @todo revisit this when multi devices is supported. - } else if (vcmd.migrationFlags() & - CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { - //! @todo revisit this when multi devices is supported. - } else { + memory->mgpuCacheWriteBack(); + } + else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { + // Synchronize memory from host if necessary. + // The sync function will perform memory migration from + // another device if necessary + device::Memory::SyncFlags syncFlags; + memory->syncCacheFromHost(*this, syncFlags); + } + else { LogWarning("Unknown operation for memory migration!"); } } @@ -1638,8 +1677,7 @@ VirtualGPU::submitKernelInternal( argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_); //! @todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { + if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { mem->signalWrite(&dev()); } break; @@ -1677,8 +1715,7 @@ VirtualGPU::submitKernelInternal( } //! @todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { + if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { mem->signalWrite(&dev()); } break; @@ -1828,7 +1865,7 @@ void VirtualGPU::flush(amd::Command *list, bool wait) { releaseGpuMemoryFence(); updateCommandsState(list); - // Rlease all pinned memory + // Release all pinned memory releasePinnedMem(); } diff --git a/projects/clr/rocclr/runtime/platform/memory.cpp b/projects/clr/rocclr/runtime/platform/memory.cpp index 92e0750e18..9370e6c53f 100644 --- a/projects/clr/rocclr/runtime/platform/memory.cpp +++ b/projects/clr/rocclr/runtime/platform/memory.cpp @@ -125,6 +125,9 @@ Memory::Memory( parent_->retain(); parent_->isParent_ = true; + if (parent.getHostMem() != nullptr) { + setHostMem(reinterpret_cast
(parent.getHostMem()) + origin); + } // Inherit memory flags from the parent if ((flags_ & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY)) == 0) { @@ -407,7 +410,7 @@ Memory::~Memory() // Release the parent. if (NULL != parent_) { // Update cache if runtime destroys a subbuffer - if (NULL != parent_->getHostMem()) { + if (NULL != parent_->getHostMem() && (vDev_ == NULL)) { cacheWriteBack(); } parent_->removeSubBuffer(this); @@ -567,8 +570,9 @@ Pipe::initDeviceMemory() Image::Image( const Format& format, Image& parent, - uint baseMipLevel) - : Memory(parent, 0, 0, parent.getWidth() * parent.getHeight() * + uint baseMipLevel, + cl_mem_flags flags) + : Memory(parent, flags, 0, parent.getWidth() * parent.getHeight() * parent.getDepth() * format.getElementSize()) , impl_(format, Coord3D(parent.getWidth() * parent.getImageFormat().getElementSize() / @@ -1193,12 +1197,13 @@ Image::createView( const Context& context, const Format& format, device::VirtualDevice* vDev, - uint baseMipLevel) + uint baseMipLevel, + cl_mem_flags flags) { Image* view = NULL; // Find the image dimensions and create a corresponding object - view = new (context) Image(format, *this, baseMipLevel); + view = new (context) Image(format, *this, baseMipLevel, flags); // Set GPU virtual device for this view view->setVirtualDevice(vDev); diff --git a/projects/clr/rocclr/runtime/platform/memory.hpp b/projects/clr/rocclr/runtime/platform/memory.hpp index a2d3b4dd5a..61e011cc96 100644 --- a/projects/clr/rocclr/runtime/platform/memory.hpp +++ b/projects/clr/rocclr/runtime/platform/memory.hpp @@ -170,7 +170,7 @@ protected: bool isParent_; //!< This object is a parent device::VirtualDevice* vDev_; //!< Memory object belongs to a virtual device only bool forceSysMemAlloc_; //!< Forces system memory allocation - std::atomic_uint mapCount_; //!< Keep track of number of mappings for a memory object + std::atomic_uint mapCount_; //!< Keep track of number of mappings for a memory object void * svmHostAddress_; //!< svm host address; bool svmPtrCommited_; //!< svm host address committed flag; bool canBeCached_; //!< flag to if the object can be cached; @@ -516,7 +516,8 @@ protected: Image( const Format& format, Image& parent, - uint baseMipLevel = 0); + uint baseMipLevel = 0, + cl_mem_flags flags = 0); ///! Initializes the device memory array which is nested // after'Image' object in memory layout. @@ -593,7 +594,8 @@ public: const Context& context, //!< Context for a view creation const Format& format, //!< The new format for a view device::VirtualDevice* vDev, //!< Virtual device object - uint baseMipLevel = 0 //!< Base mip level for a view + uint baseMipLevel = 0, //!< Base mip level for a view + cl_mem_flags flags = 0 //!< Memory allocation flags ); //! Returns the impl for this image.