diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp index 83c6988a7d..a557565ee8 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp @@ -1,528 +1,996 @@ // -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. // - #include "platform/commandqueue.hpp" #include "device/rocm/rocdevice.hpp" #include "device/rocm/rocblit.hpp" #include "device/rocm/rocmemory.hpp" #include "device/rocm/rocvirtual.hpp" #include "utils/debug.hpp" +#include namespace roc { - -void -FindPinSize( - size_t& pinSize, const amd::Coord3D& size, - size_t& rowPitch, size_t& slicePitch, const Image& image) +DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) + : HostBlitManager(gpu, setup) + , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_) + , completeOperation_(false) + , context_(NULL) { - size_t elementSize = image.owner()->asImage()->getImageFormat().getElementSize(); - pinSize = size[0] * elementSize; - if ((rowPitch == 0) || (rowPitch == pinSize)) { - rowPitch = 0; +} + +inline void +DmaBlitManager::synchronize() const +{ + gpu().releaseGpuMemoryFence(); + + if (syncOperation_) { +// gpu().waitAllEngines(); + } +} + +inline Memory& +DmaBlitManager::gpuMem(device::Memory& mem) const +{ + return static_cast(mem); +} + +bool +DmaBlitManager::readMemoryStaged( + Memory& srcMemory, + void* dstHost, + Memory& xferBuf, + size_t origin, + size_t& offset, + size_t& totalSize, + size_t xferSize) const +{ + const_address src = srcMemory.getDeviceMemory(); + address staging = xferBuf.getDeviceMemory(); + + // Copy data from device to host + src += origin + offset; + address dst = reinterpret_cast
(dstHost) + offset; + bool ret = hsaCopyStaged(src, dst, totalSize, staging, false); + + return ret; +} + +bool +DmaBlitManager::readBuffer( + device::Memory& srcMemory, + void* dstHost, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire) const +{ + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || gpuMem(srcMemory).isHostMemDirectAccess()) { + return HostBlitManager::readBuffer( + srcMemory, dstHost, origin, size, entire); } else { - pinSize = rowPitch; - } + size_t srcSize = size[0]; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, srcSize); - // Calculate the pin size, which should be equal to the copy size - for (uint i = 1; i < 3; ++i) { - pinSize *= size[i]; - if (i == 1) { - if ((slicePitch == 0) || (slicePitch == pinSize)) { - slicePitch = 0; - } - else { - if (image.getHsaImageDescriptor().geometry != HSA_EXT_IMAGE_GEOMETRY_1DA) { - pinSize = slicePitch; + // Check if a pinned transfer can be executed + if (pinSize && (srcSize > MinSizeForPinnedTransfer)) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(dstHost), + PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(dstHost) - tmpHost; + + amd::Memory* pinned = NULL; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (srcSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, + PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, srcSize); + first = false; } else { - pinSize = slicePitch * size[i]; + tmpSize = std::min(pinSize, srcSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; + } + amd::Coord3D dst(partial, 0, 0); + amd::Coord3D srcPin(origin[0] + offset, 0, 
0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; + + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + if (pinned != NULL) { + // Get device memory for this virtual device + Memory* dstMemory = dev().getRocMemory(pinned); + + if (!hsaCopy(gpuMem(srcMemory), *dstMemory, + srcPin, dst, copySizePin)) { + LogWarning("DmaBlitManager::readBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } + else { + LogWarning("DmaBlitManager::readBuffer failed to pin a resource!"); + break; + } + srcSize -= tmpSize; + offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } + } + + if (0 != srcSize) { + Memory& xferBuf = dev().xferRead().acquire(); + + // Read memory using a staging resource + if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], + offset, srcSize, srcSize)) { + LogError("DmaBlitManager::readBuffer failed!"); + return false; + } + + dev().xferRead().release(gpu(), xferBuf); + } + } + + return true; +} + +bool +DmaBlitManager::readBufferRect( + device::Memory& srcMemory, + void* dstHost, + const amd::BufferRect& bufRect, + const amd::BufferRect& hostRect, + const amd::Coord3D& size, + bool entire) const +{ + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) { + return HostBlitManager::readBufferRect( + srcMemory, dstHost, bufRect, hostRect, size, entire); + } + else { + Memory& xferBuf = dev().xferRead().acquire(); + address staging = xferBuf.getDeviceMemory(); + const_address src = gpuMem(srcMemory).getDeviceMemory(); + + size_t srcOffset; + size_t dstOffset; + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = bufRect.offset(0, y, z); + dstOffset = hostRect.offset(0, y, z); + + // Copy data from device to host - line by line + address dst = reinterpret_cast
(dstHost) + dstOffset; + src += srcOffset; + bool retval = hsaCopyStaged(src, dst, size[0], staging, false); + if (!retval) { + return retval; } } } + dev().xferRead().release(gpu(), xferBuf); } + + return true; } -HsaBlitManager::HsaBlitManager(device::VirtualDevice& vDev, Setup setup) - : HostBlitManager(vDev, setup), - roc_device_(reinterpret_cast(dev_)) { - completion_signal_.handle = 0; -} - -bool HsaBlitManager::hsaCopy(const void *hostSrc, void *hostDst, - uint32_t size, bool hostToDev) const { - - // No allocation is necessary for Full Profile - hsa_status_t status; - if (roc_device_.agent_profile() == HSA_PROFILE_FULL) { - status = hsa_memory_copy(hostDst, hostSrc, size); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("Hsa copy of data failed with code %d", status); - } - return (status == HSA_STATUS_SUCCESS); - } - - // Allocate requested size of memory - size_t align = 0x04; - bool atomics = false; - void *hsaBuffer = NULL; - hsaBuffer = roc_device_.hostAlloc(size, align, false); - if (hsaBuffer == NULL) { - LogError("Hsa buffer allocation failed with code"); - return false; - } - - const hsa_signal_value_t kInitVal = 1; - hsa_signal_store_relaxed(completion_signal_, kInitVal); - - // Copy data from Host to Device - if (hostToDev) { - memcpy(hsaBuffer, hostSrc, size); - status = hsa_amd_memory_async_copy( - hostDst, roc_device_.getBackendDevice(), hsaBuffer, - roc_device_.getCpuAgent(), size, 0, NULL, completion_signal_); - if (status == HSA_STATUS_SUCCESS) { - hsa_signal_value_t val = - hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE); - - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - status = HSA_STATUS_ERROR; - } +bool +DmaBlitManager::readImage( + device::Memory& srcMemory, + void* dstHost, + const amd::Coord3D& origin, + const amd::Coord3D& size, + size_t rowPitch, + size_t slicePitch, + bool entire) const +{ + if (setup_.disableReadImage_) { + return 
HostBlitManager::readImage(srcMemory, dstHost, + origin, size, rowPitch, slicePitch, entire); } else { - LogPrintfError("Hsa copy from host to device failed with code %d", status); + //! @todo Add HW accelerated path + return HostBlitManager::readImage(srcMemory, dstHost, + origin, size, rowPitch, slicePitch, entire); } - roc_device_.hostFree(hsaBuffer, size); - return (status == HSA_STATUS_SUCCESS); - } + return true; +} - // Copy data from Device to Host - status = hsa_amd_memory_async_copy(hsaBuffer, roc_device_.getCpuAgent(), - hostSrc, roc_device_.getBackendDevice(), - size, 0, NULL, completion_signal_); - if (status == HSA_STATUS_SUCCESS) { - hsa_signal_value_t val = hsa_signal_wait_acquire( - completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), - HSA_WAIT_STATE_ACTIVE); +bool +DmaBlitManager::writeMemoryStaged( + const void* srcHost, + Memory& dstMemory, + Memory& xferBuf, + size_t origin, + size_t& offset, + size_t& totalSize, + size_t xferSize) const +{ + address dst = dstMemory.getDeviceMemory(); + address staging = xferBuf.getDeviceMemory(); - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - status = HSA_STATUS_ERROR; + // Copy data from host to device + dst += origin + offset; + const_address src = reinterpret_cast(srcHost) + offset; + bool retval = hsaCopyStaged(src, dst, totalSize, staging, true); + + return retval; +} + +bool +DmaBlitManager::writeBuffer( + const void* srcHost, + device::Memory& dstMemory, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire) const +{ + // Use host copy if memory has direct access + if (setup_.disableWriteBuffer_ || + gpuMem(dstMemory).isHostMemDirectAccess()) { + return HostBlitManager::writeBuffer( + srcHost, dstMemory, origin, size, entire); + } + else { + size_t dstSize = size[0]; + size_t tmpSize = 0; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, dstSize); + + // Check if a pinned transfer can be executed + if 
(pinSize && (dstSize > MinSizeForPinnedTransfer)) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(srcHost), + PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(srcHost) - tmpHost; + + amd::Memory* pinned = NULL; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (dstSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, + PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, dstSize); + first = false; + } + else { + tmpSize = std::min(pinSize, dstSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; + } + amd::Coord3D src(partial, 0, 0); + amd::Coord3D dstPin(origin[0] + offset, 0, 0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; + + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + + if (pinned != NULL) { + // Get device memory for this virtual device + Memory* srcMemory = dev().getRocMemory(pinned); + + if (!hsaCopy(*srcMemory, gpuMem(dstMemory), src, dstPin, + copySizePin)) { + LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } + else { + LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!"); + break; + } + dstSize -= tmpSize; + offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } + } + + if (dstSize != 0) { + Memory& xferBuf = dev().xferWrite().acquire(); + + // Write memory using a staging resource + if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], + offset, dstSize, dstSize)) { + LogError("DmaBlitManager::writeBuffer failed!"); + return false; + } + + gpu().addXferWrite(xferBuf); + } } - if (status == 
HSA_STATUS_SUCCESS) { - memcpy(hostDst, hsaBuffer, size); + return true; +} + +bool +DmaBlitManager::writeBufferRect( + const void* srcHost, + device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, + const amd::Coord3D& size, + bool entire) const +{ + // Use host copy if memory has direct access + if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) { + return HostBlitManager::writeBufferRect( + srcHost, dstMemory, hostRect, bufRect, size, entire); } - } else { - LogPrintfError("Hsa copy from device to host failed with code %d", status); - } - - roc_device_.hostFree(hsaBuffer, size); - return (status == HSA_STATUS_SUCCESS); -} + else { + Memory& xferBuf = dev().xferWrite().acquire(); + address staging = xferBuf.getDeviceMemory(); + address dst = static_cast(dstMemory).getDeviceMemory(); -bool HsaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire) const { - hsa_memory_register(dstHost, size[0]); - void* src = static_cast(srcMemory).getDeviceMemory(); + size_t srcOffset; + size_t dstOffset; - // Copy data from device to host - const void *srcDev = reinterpret_cast(src) + origin[0]; - bool retval = hsaCopy(srcDev, dstHost, size[0], false); + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = hostRect.offset(0, y, z); + dstOffset = bufRect.offset(0, y, z); - hsa_memory_deregister(dstHost, size[0]); - return retval; -} - -bool HsaBlitManager::readBufferRect(device::Memory& srcMemory, void* dst, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const { - void* src = static_cast(srcMemory).getDeviceMemory(); - - size_t srcOffset; - size_t dstOffset; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = bufRect.offset(0, y, z); - dstOffset = hostRect.offset(0, y, z); - - // Copy data 
from device to host - line by line - void *dstHost = reinterpret_cast
(dst) + dstOffset; - const void *srcDev = reinterpret_cast(src) + srcOffset; - bool retval = hsaCopy(srcDev, dstHost, size[0], false); - if (!retval) { - return retval; - } - } - } - - return true; -} - -static bool hsaCopyImageToBuffer(hsa_agent_t agent, - hsa_ext_image_t srcImage, - void* dstBuffer, const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, size_t slicePitch) { - hsa_ext_image_region_t image_region; - image_region.offset.x = srcOrigin[0]; - image_region.offset.y = srcOrigin[1]; - image_region.offset.z = srcOrigin[2]; - image_region.range.x = size[0]; - image_region.range.y = size[1]; - image_region.range.z = size[2]; - - char *dstHost = ((char*)dstBuffer) + dstOrigin[0]; - - hsa_status_t status = hsa_ext_image_export(agent, srcImage, dstHost, rowPitch, - slicePitch, &image_region); - return (status == HSA_STATUS_SUCCESS); -} - -bool HsaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, size_t rowPitch, - size_t slicePitch, bool entire) const { - roc::Image* srcImage = (roc::Image*)&srcMemory; - - void* svmDstHost = NULL; - size_t pinSize = 0; - FindPinSize(pinSize, size, rowPitch, slicePitch, *srcImage); - - hsa_agent_t agent = gpu().gpu_device(); - - hsa_status_t status = hsa_amd_memory_lock(dstHost, pinSize, - &agent, 1, &svmDstHost); - - if (status != HSA_STATUS_SUCCESS) { - return false; - } - - bool retval = hsaCopyImageToBuffer(agent, srcImage->getHsaImageObject(), - svmDstHost, origin, amd::Coord3D(0), size, entire, - rowPitch, slicePitch); - hsa_amd_memory_unlock(dstHost); - return retval; -} - -bool HsaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire) const { - hsa_memory_register(const_cast(srcHost), size[0]); - void* dst = static_cast(dstMemory).getDeviceMemory(); - - // Copy data from host to device - void 
*dstDev = reinterpret_cast
(dst) + origin[0]; - bool retval = hsaCopy(srcHost, dstDev, size[0], true); - - hsa_memory_deregister(const_cast(srcHost), size[0]); - return retval; -} - -bool HsaBlitManager::writeBufferRect(const void* src, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const { - void* dst = static_cast(dstMemory).getDeviceMemory(); - - size_t srcOffset; - size_t dstOffset; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = hostRect.offset(0, y, z); - dstOffset = bufRect.offset(0, y, z); - - // Copy data from host to device - line by line - void *dstDev = reinterpret_cast
(dst) + dstOffset; - const void *srcHost = reinterpret_cast(src) + srcOffset; - bool retval = hsaCopy(srcHost, dstDev, size[0], true); - if (!retval) { - return retval; - } - } - } - - return true; -} - -bool hsaCopyBufferToImage(hsa_agent_t agent, const void* srcBuffer, - hsa_ext_image_t dstImage, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, size_t slicePitch) { - char* srcHost = ((char*)srcBuffer) + srcOrigin[0]; - - hsa_ext_image_region_t image_region; - image_region.offset.x = dstOrigin[0]; - image_region.offset.y = dstOrigin[1]; - image_region.offset.z = dstOrigin[2]; - image_region.range.x = size[0]; - image_region.range.y = size[1]; - image_region.range.z = size[2]; - - hsa_status_t status = hsa_ext_image_import( - agent, srcHost, rowPitch, slicePitch, dstImage, &image_region); - return (status == HSA_STATUS_SUCCESS); -} - -bool HsaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, size_t rowPitch, - size_t slicePitch, bool entire) const { - roc::Image* image = (roc::Image*)&dstMemory; - - void* svmSrcHost = NULL; - size_t pinSize = 0; - FindPinSize(pinSize, size, rowPitch, slicePitch, *image); - - hsa_agent_t agent = gpu().gpu_device(); - - hsa_status_t status = hsa_amd_memory_lock(const_cast(srcHost), pinSize, - &agent, 1, &svmSrcHost); - - if (status != HSA_STATUS_SUCCESS) { - return false; - } - - bool retval = hsaCopyBufferToImage(agent, svmSrcHost, - image->getHsaImageObject(), amd::Coord3D(0), - origin, size, entire, rowPitch, slicePitch); - - hsa_amd_memory_unlock(const_cast(srcHost)); - - return retval; -} - -bool HsaBlitManager::copyBuffer(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire) const { - void* src = static_cast(srcMemory).getDeviceMemory(); - void* dst = 
static_cast(dstMemory).getDeviceMemory(); - - if (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) { - if (srcMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) { - src = srcMemory.owner()->getHostMem(); + // Copy data from host to device - line by line + dst += dstOffset; + const_address src = reinterpret_cast(srcHost) + srcOffset; + bool retval = hsaCopyStaged(src, dst, size[0], staging, true); + if (!retval) { + return retval; + } + } + } + gpu().addXferWrite(xferBuf); } - if (dstMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) { - dst = dstMemory.owner()->getHostMem(); - } - } - - const hsa_agent_t src_agent = (srcMemory.isHostMemDirectAccess()) - ? roc_device_.getCpuAgent() - : roc_device_.getBackendDevice(); - - const hsa_agent_t dst_agent = (dstMemory.isHostMemDirectAccess()) - ? roc_device_.getCpuAgent() - : roc_device_.getBackendDevice(); - - // Straight forward buffer copy - const hsa_signal_value_t kInitVal = 1; - hsa_signal_store_relaxed(completion_signal_, kInitVal); - hsa_status_t status = hsa_amd_memory_async_copy( - (reinterpret_cast
(dst) + dstOrigin[0]), dst_agent, - (reinterpret_cast(src) + srcOrigin[0]), src_agent, size[0], - 0, NULL, completion_signal_); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - - hsa_signal_value_t val = - hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE); - - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - return false; - } - - return true; + return true; } -bool HsaBlitManager::copyBufferRect(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRect, - const amd::BufferRect& dstRect, - const amd::Coord3D& size, - bool entire) const { - void* src = static_cast(srcMemory).getDeviceMemory(); - void* dst = static_cast(dstMemory).getDeviceMemory(); +bool +DmaBlitManager::writeImage( + const void* srcHost, + device::Memory& dstMemory, + const amd::Coord3D& origin, + const amd::Coord3D& size, + size_t rowPitch, + size_t slicePitch, + bool entire) const +{ + if (setup_.disableWriteImage_) { + return HostBlitManager::writeImage( + srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + } + else { + //! 
@todo Add HW accelerated path + return HostBlitManager::writeImage( + srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + } - const hsa_signal_value_t kInitVal = size[2] * size[1]; - hsa_signal_store_relaxed(completion_signal_, kInitVal); + return true; +} - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - size_t srcOffset = srcRect.offset(0, y, z); - size_t dstOffset = dstRect.offset(0, y, z); +bool +DmaBlitManager::copyBuffer( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire) const +{ + if (setup_.disableCopyBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && + (dev().agent_profile() != HSA_PROFILE_FULL) && + gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, size); + } + else { + return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), + srcOrigin, dstOrigin, size); + } - // Copy memory line by line - hsa_status_t status = hsa_amd_memory_async_copy( - (reinterpret_cast
(dst) + dstOffset), - roc_device_.getBackendDevice(), - (reinterpret_cast(src) + srcOffset), - roc_device_.getBackendDevice(), size[0], 0, NULL, - completion_signal_); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("DMA buffer failed with code %d", status); + return true; +} + +bool +DmaBlitManager::copyBufferRect( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::BufferRect& srcRect, + const amd::BufferRect& dstRect, + const amd::Coord3D& size, + bool entire) const +{ + if (setup_.disableCopyBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && + gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBufferRect( + srcMemory, dstMemory, srcRect, dstRect, size, entire); + } + else { return false; - } + void* src = gpuMem(srcMemory).getDeviceMemory(); + void* dst = gpuMem(dstMemory).getDeviceMemory(); + + // Detect the agents for memory allocations + const hsa_agent_t srcAgent = (srcMemory.isHostMemDirectAccess()) ? + dev().getCpuAgent() : dev().getBackendDevice(); + const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ? + dev().getCpuAgent() : dev().getBackendDevice(); + + const hsa_signal_value_t kInitVal = size[2] * size[1]; + hsa_signal_store_relaxed(completion_signal_, kInitVal); + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + size_t srcOffset = srcRect.offset(0, y, z); + size_t dstOffset = dstRect.offset(0, y, z); + + // Copy memory line by line + hsa_status_t status = hsa_amd_memory_async_copy( + (reinterpret_cast
(dst) + dstOffset), dstAgent, + (reinterpret_cast(src) + srcOffset), + srcAgent, size[0], 0, NULL, completion_signal_); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("DMA buffer failed with code %d", status); + return false; + } + } + } + + hsa_signal_value_t val = + hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, + 0, uint64_t(-1), HSA_WAIT_STATE_ACTIVE); + + if (val != 0) { + LogError("Async copy failed"); + return false; + } } - } - - hsa_signal_value_t val = - hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, - 0, uint64_t(-1), HSA_WAIT_STATE_ACTIVE); - - if (val != 0) { - LogError("Async copy failed"); - return false; - } - - return true; + return true; } -bool HsaBlitManager::copyImageToBuffer(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, - size_t slicePitch) const { - roc::Image& srcImage = (roc::Image&)srcMemory; - roc::Buffer& dstBuffer = (roc::Buffer&)dstMemory; +bool +DmaBlitManager::copyImageToBuffer( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + bool result = false; - return hsaCopyImageToBuffer(gpu().gpu_device(), srcImage.getHsaImageObject(), - dstBuffer.getDeviceMemory(), srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); -} + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, + srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + else { + Image& srcImage = static_cast(srcMemory); + Buffer& dstBuffer = static_cast(dstMemory); -bool HsaBlitManager::copyBufferToImage(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool 
entire, - size_t rowPitch, - size_t slicePitch) const { - roc::Buffer& srcBuffer = (roc::Buffer&)srcMemory; - roc::Image& dstImage = (roc::Image&)dstMemory; + // Use ROC path for a transfer + // Note: it doesn't support SDMA + address dstHost = reinterpret_cast
(dstBuffer.getDeviceMemory()) + + dstOrigin[0]; - return hsaCopyBufferToImage(gpu().gpu_device(), srcBuffer.getDeviceMemory(), - dstImage.getHsaImageObject(), srcOrigin, - dstOrigin, size, entire, rowPitch, slicePitch); -} + // Use ROCm path for a transfer. + // Note: it doesn't support SDMA + hsa_ext_image_region_t image_region; + image_region.offset.x = srcOrigin[0]; + image_region.offset.y = srcOrigin[1]; + image_region.offset.z = srcOrigin[2]; + image_region.range.x = size[0]; + image_region.range.y = size[1]; + image_region.range.z = size[2]; -bool HsaBlitManager::copyImage(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire) const { - if (srcMemory.isHostMemDirectAccess() && - dstMemory.isHostMemDirectAccess()) { - return device::HostBlitManager::copyImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); - } + hsa_status_t status = hsa_ext_image_export(gpu().gpu_device(), + srcImage.getHsaImageObject(), dstHost, rowPitch, + slicePitch, &image_region); + result = (status == HSA_STATUS_SUCCESS) ? 
true : false; - roc::Image *srcImage = (roc::Image *)&srcMemory; - roc::Image *dstImage = (roc::Image *)&dstMemory; - - hsa_dim3_t src_offset = { 0 }; - src_offset.x = srcOrigin[0]; - src_offset.y = srcOrigin[1]; - src_offset.z = srcOrigin[2]; - - hsa_dim3_t dst_offset = { 0 }; - dst_offset.x = dstOrigin[0]; - dst_offset.y = dstOrigin[1]; - dst_offset.z = dstOrigin[2]; - - hsa_dim3_t copy_size = { 0 }; - copy_size.x = size[0]; - copy_size.y = size[1]; - copy_size.z = size[2]; - - hsa_status_t status = hsa_ext_image_copy( - gpu().gpu_device(), srcImage->getHsaImageObject(), &src_offset, - dstImage->getHsaImageObject(), &dst_offset, ©_size); - return (status == HSA_STATUS_SUCCESS); -} - -bool HsaBlitManager::fillBuffer(device::Memory& memory, const void* pattern, - size_t patternSize, const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire) const { - void* fillMem = static_cast(memory).getDeviceMemory(); - - size_t offset = origin[0]; - size_t fillSize = size[0]; - - if ((fillSize % patternSize) != 0) { - LogError("Misaligned buffer size and pattern size!"); - } - - // Fill the buffer memory with a pattern - for (size_t i = 0; i < (fillSize / patternSize); i++) { - void *dstDev = reinterpret_cast
(fillMem) + offset; - bool retval = hsaCopy(pattern, dstDev, patternSize, true); - if (!retval) { - LogError("DMA buffer failed with code"); - return retval; + // Check if a HostBlit transfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyImageToBuffer(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } } - offset += patternSize; - } - - return true; + return result; } -bool HsaBlitManager::fillImage(device::Memory& memory, const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire) const { - if (memory.isHostMemDirectAccess()) { - return device::HostBlitManager::fillImage(memory, pattern, origin, size, entire); - } +bool +DmaBlitManager::copyBufferToImage( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + bool result = false; - roc::Image *image = (roc::Image*)&memory; - hsa_ext_image_region_t image_region; - image_region.offset.x = origin[0]; - image_region.offset.y = origin[1]; - image_region.offset.z = origin[2]; - image_region.range.x = size[0]; - image_region.range.y = size[1]; - image_region.range.z = size[2]; + if (setup_.disableCopyBufferToImage_) { + result = HostBlitManager::copyBufferToImage(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + else { + Buffer& srcBuffer = static_cast(srcMemory); + Image& dstImage = static_cast(dstMemory); - hsa_status_t status = hsa_ext_image_clear( - gpu().gpu_device(), image->getHsaImageObject(), - pattern, &image_region); - return (status == HSA_STATUS_SUCCESS); + // Use ROC path for a transfer + // Note: it doesn't support SDMA + address srcHost = reinterpret_cast
(srcBuffer.getDeviceMemory()) + + srcOrigin[0]; + + hsa_ext_image_region_t image_region; + image_region.offset.x = dstOrigin[0]; + image_region.offset.y = dstOrigin[1]; + image_region.offset.z = dstOrigin[2]; + image_region.range.x = size[0]; + image_region.range.y = size[1]; + image_region.range.z = size[2]; + + hsa_status_t status = hsa_ext_image_import(gpu().gpu_device(), + srcHost, rowPitch, slicePitch, dstImage.getHsaImageObject(), &image_region); + result = (status == HSA_STATUS_SUCCESS) ? true : false; + + // Check if a HostBlit tran sfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyBufferToImage(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + } + + return result; } -static void +bool +DmaBlitManager::copyImage( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire) const +{ + bool result = false; + + if (setup_.disableCopyImage_) { + return HostBlitManager::copyImage(srcMemory, dstMemory, + srcOrigin, dstOrigin, size, entire); + } + else { + //! @todo Add HW accelerated path + return HostBlitManager::copyImage(srcMemory, dstMemory, + srcOrigin, dstOrigin, size, entire); + } + + return result; +} + +bool DmaBlitManager::hsaCopy( + const Memory& srcMemory, + const Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool enableCopyRect, + bool flushDMA) const +{ + address src = reinterpret_cast
(srcMemory.getDeviceMemory()); + address dst = reinterpret_cast
(dstMemory.getDeviceMemory()); + + src += srcOrigin[0]; + dst += dstOrigin[0]; + + // Just call copy function for full profile + hsa_status_t status; + if (dev().agent_profile() == HSA_PROFILE_FULL) { + status = hsa_memory_copy(dst, src, size[0]); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("Hsa copy of data failed with code %d", status); + } + return (status == HSA_STATUS_SUCCESS); + } + + // Detect the agents for memory allocations + const hsa_agent_t srcAgent = (srcMemory.isHostMemDirectAccess()) ? + dev().getCpuAgent() : dev().getBackendDevice(); + const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ? + dev().getCpuAgent() : dev().getBackendDevice(); + + const hsa_signal_value_t kInitVal = 1; + hsa_signal_store_relaxed(completion_signal_, kInitVal); + + // Use SDMA to transfer the data + status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, + size[0], 0, nullptr, completion_signal_); + if (status == HSA_STATUS_SUCCESS) { + hsa_signal_value_t val = hsa_signal_wait_acquire( + completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, + uint64_t(-1), HSA_WAIT_STATE_ACTIVE); + if (val != (kInitVal - 1)) { + LogError("Async copy failed"); + status = HSA_STATUS_ERROR; + } + } + else { + LogPrintfError("Hsa copy from host to device failed with code %d", status); + } + + return (status == HSA_STATUS_SUCCESS); +} + +bool DmaBlitManager::hsaCopyStaged( + const_address hostSrc, address hostDst, size_t size, address staging, bool hostToDev) const +{ + // No allocation is necessary for Full Profile + hsa_status_t status; + if (dev().agent_profile() == HSA_PROFILE_FULL) { + status = hsa_memory_copy(hostDst, hostSrc, size); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("Hsa copy of data failed with code %d", status); + } + return (status == HSA_STATUS_SUCCESS); + } + + size_t totalSize = size; + size_t offset = 0; + + address hsaBuffer = staging; + + const hsa_signal_value_t kInitVal = 1; + + // Allocate requested size of memory + while 
(totalSize > 0) { + size = std::min(totalSize, dev().settings().stagedXferSize_); + hsa_signal_store_relaxed(completion_signal_, kInitVal); + + // Copy data from Host to Device + if (hostToDev) { + memcpy(hsaBuffer, hostSrc + offset, size); + status = hsa_amd_memory_async_copy( + hostDst + offset, dev().getBackendDevice(), hsaBuffer, + dev().getCpuAgent(), size, 0, NULL, completion_signal_); + if (status == HSA_STATUS_SUCCESS) { + hsa_signal_value_t val = + hsa_signal_wait_acquire(completion_signal_, + HSA_SIGNAL_CONDITION_EQ, 0, + uint64_t(-1), HSA_WAIT_STATE_ACTIVE); + + if (val != (kInitVal - 1)) { + LogError("Async copy failed"); + return false; + } + } + else { + LogPrintfError("Hsa copy from host to device failed with code %d", status); + return false; + } + totalSize -= size; + offset += size; + continue; + } + + // Copy data from Device to Host + status = hsa_amd_memory_async_copy(hsaBuffer, + dev().getCpuAgent(), hostSrc + offset, dev().getBackendDevice(), + size, 0, NULL, completion_signal_); + if (status == HSA_STATUS_SUCCESS) { + hsa_signal_value_t val = hsa_signal_wait_acquire( + completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), + HSA_WAIT_STATE_ACTIVE); + + if (val != (kInitVal - 1)) { + LogError("Async copy failed"); + return false; + } + memcpy(hostDst + offset, hsaBuffer, size); + } + else { + LogPrintfError("Hsa copy from device to host failed with code %d", status); + return false; + } + totalSize -= size; + offset += size; + } + + return true; +} + +KernelBlitManager::KernelBlitManager( + VirtualGPU& gpu, Setup setup) + : DmaBlitManager(gpu, setup) + , program_(NULL) + , constantBuffer_(NULL) + , xferBufferSize_(0) + , lockXferOps_(NULL) +{ + for (uint i = 0; i < BlitTotal; ++i) { + kernels_[i] = NULL; + } + + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuffers_[i] = NULL; + } + + completeOperation_ = false; +} + +KernelBlitManager::~KernelBlitManager() +{ + for (uint i = 0; i < BlitTotal; ++i) { + if (NULL != kernels_[i]) { + 
kernels_[i]->release(); + } + } + if (NULL != program_) { + program_->release(); + } + + if (NULL != context_) { + // Release a dummy context + context_->release(); + } + + if (NULL != constantBuffer_) { + constantBuffer_->release(); + } + + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (NULL != xferBuffers_[i]) { + xferBuffers_[i]->release(); + } + } + + delete lockXferOps_; +} + +bool +KernelBlitManager::create(amd::Device& device) +{ + if (!DmaBlitManager::create(device)) { + return false; + } + + if (!createProgram(static_cast(device))) { + return false; + } + return true; +} + +bool +KernelBlitManager::createProgram(Device& device) +{ + if (device.blitProgram() == nullptr) { + return false; + } + + std::vector devices; + devices.push_back(&device); + + // Save context and program for this device + context_ = device.blitProgram()->context_; + context_->retain(); + program_ = device.blitProgram()->program_; + program_->retain(); + + bool result = false; + do { + // Create kernel objects for all blits + for (uint i = 0; i < BlitTotal; ++i) { + const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); + if (symbol == NULL) { + break; + } + kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); + if (kernels_[i] == NULL) { + break; + } + // Validate blit kernels for the scratch memory usage (pre SI) + if (!device.validateKernel(*kernels_[i], &gpu())) { + break; + } + } + + result = true; + } while(!result); + + // Create an internal constant buffer + constantBuffer_ = new (*context_) + amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); + + if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) { + constantBuffer_->release(); + constantBuffer_ = NULL; + return false; + } + else if (constantBuffer_ == NULL) { + return false; + } + + // Assign the constant buffer to the current virtual GPU + constantBuffer_->setVirtualDevice(&gpu()); + + if (dev().settings().xferBufSize_ > 0) { + xferBufferSize_ = dev().settings().xferBufSize_; + for 
(uint i = 0; i < MaxXferBuffers; ++i) { + // Create internal xfer buffers for image copy optimization + xferBuffers_[i] = new (*context_) + amd::Buffer(*context_, 0, xferBufferSize_); + + if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) { + xferBuffers_[i]->release(); + xferBuffers_[i] = NULL; + return false; + } + else if (xferBuffers_[i] == NULL) { + return false; + } + + // Assign the xfer buffer to the current virtual GPU + xferBuffers_[i]->setVirtualDevice(&gpu()); + //! @note Workaround for conformance allocation test. + //! Force GPU mem alloc. + //! Unaligned images require xfer optimization, + //! but deferred memory allocation can cause + //! virtual heap fragmentation for big allocations and + //! then fail the following test with 32 bit ISA, because + //! runtime runs out of 4GB space. + dev().getRocMemory(xferBuffers_[i]); + } + } + + lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); + if (NULL == lockXferOps_) { + return false; + } + + return result; +} + +// The following data structures will be used for the view creations. 
+// Some formats has to be converted before a kernel blit operation +struct FormatConvertion { + cl_uint clOldType_; + cl_uint clNewType_; +}; + +// The list of rejected data formats and corresponding conversion +static const FormatConvertion RejectedData[] = +{ + { CL_UNORM_INT8, CL_UNSIGNED_INT8 }, + { CL_UNORM_INT16, CL_UNSIGNED_INT16 }, + { CL_SNORM_INT8, CL_UNSIGNED_INT8 }, + { CL_SNORM_INT16, CL_UNSIGNED_INT16 }, + { CL_HALF_FLOAT, CL_UNSIGNED_INT16 }, + { CL_FLOAT, CL_UNSIGNED_INT32 }, + { CL_SIGNED_INT8, CL_UNSIGNED_INT8 }, + { CL_SIGNED_INT16, CL_UNSIGNED_INT16 }, + { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 }, + { CL_SIGNED_INT32, CL_UNSIGNED_INT32 } +}; + +// The list of rejected channel's order and corresponding conversion +static const FormatConvertion RejectedOrder[] = +{ + { CL_A, CL_R }, + { CL_RA, CL_RG }, + { CL_LUMINANCE, CL_R }, + { CL_INTENSITY, CL_R }, + { CL_RGB, CL_RGBA }, + { CL_BGRA, CL_RGBA }, + { CL_ARGB, CL_RGBA }, + { CL_sRGB, CL_RGBA }, + { CL_sRGBx, CL_RGBA }, + { CL_sRGBA, CL_RGBA }, + { CL_sBGRA, CL_RGBA }, + { CL_DEPTH, CL_R } +}; + +const uint RejectedFormatDataTotal = + sizeof(RejectedData) / sizeof(FormatConvertion); +const uint RejectedFormatChannelTotal = + sizeof(RejectedOrder) / sizeof(FormatConvertion); + +bool +KernelBlitManager::copyBufferToImage( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + size_t imgRowPitch = size[0] * gpuMem(dstMemory).owner()->asImage()->getImageFormat().getElementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; + + if (setup_.disableCopyBufferToImage_) { + result = DmaBlitManager::copyBufferToImage( + srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, 
rowPitch, slicePitch); + synchronize(); + return result; + } + // Check if buffer is in system memory with direct access + else if (gpuMem(srcMemory).isHostMemDirectAccess() && + (((rowPitch == 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && + ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { + // First attempt to do this all with DMA, + // but there are restrictions with older hardware + if (dev().settings().imageDMA_) { + result = DmaBlitManager::copyBufferToImage( + srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + if (result) { + synchronize(); + return result; + } + } + } + + if (!result) { + result = copyBufferToImageKernel(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + + synchronize(); + + return result; +} + +void CalcRowSlicePitches( cl_ulong* pitch, const cl_int* copySize, size_t rowPitch, size_t slicePitch, const Memory& mem) { - const roc::Image &hsaImage = static_cast< const roc::Image &>(mem); - bool img1Darray = - (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false; - size_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize(); + uint32_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize(); + bool img1Darray = (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 
true : false; if (rowPitch == 0) { pitch[0] = copySize[0]; @@ -544,265 +1012,546 @@ CalcRowSlicePitches( } } -KernelBlitManager::KernelBlitManager(device::VirtualDevice& vDev, Setup setup) - : HsaBlitManager(vDev, setup), - context_(NULL), - program_(NULL) +static void +setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) { - for (uint i = 0; i < BlitTotal; ++i) { - kernels_[i] = NULL; - } -} - -KernelBlitManager::~KernelBlitManager() -{ - for (uint i = 0; i < BlitTotal; ++i) { - if (NULL != kernels_[i]) { - kernels_[i]->release(); - } - } - - if (NULL != program_) { - program_->release(); - } - - if (NULL != context_) { - // Release a dummy context - context_->release(); - } + kernel->parameters().set(index, size, value); } bool -KernelBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, +KernelBlitManager::copyBufferToImageKernel( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, - bool entire) const + bool entire, + size_t rowPitch, + size_t slicePitch) const { - //if (setup_.disableReadBuffer_ || srcMemory.isHostMemDirectAccess()) { - // return device::HostBlitManager::readBuffer(srcMemory, dstHost, origin, - // size, entire); - //} - // Exercise HSA path for now. 
- return HsaBlitManager::readBuffer(srcMemory, dstHost, origin, - size, entire); - - amd::Buffer *dstMemory = new (*context_) amd::Buffer( - *context_, CL_MEM_USE_HOST_PTR, size[0]); - - if (!dstMemory->create(const_cast(dstHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_); - if (devDstMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBuffer( - srcMemory, *devDstMemory, origin, amd::Coord3D(0), size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - dstMemory->release(); - - return result; -} - -bool -KernelBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - // if (setup_.disableReadBufferRect_ || srcMemory.isHostMemDirectAccess()) { - //return device::HostBlitManager::readBufferRect( - // srcMemory, dstHost, bufRect, hostRect, size, entire); - // } - - // Exercise HSA path for now. 
- return HsaBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - - size_t dstSize = hostRect.start_ + hostRect.end_; - amd::Buffer *dstMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, dstSize); - - if (!dstMemory->create(const_cast(dstHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_); - if (devDstMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBufferRect( - srcMemory, *devDstMemory, bufRect, hostRect, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - dstMemory->release(); - - return result; -} - -void -FindLinearSize( - size_t& linearSize, const amd::Coord3D& size, - size_t& rowPitch, size_t& slicePitch, const device::Memory& mem) -{ - const roc::Image &image = static_cast(mem); - size_t elementSize = mem.owner()->asImage()->getImageFormat().getElementSize(); - - linearSize = size[0] * elementSize; - if ((rowPitch == 0) || (rowPitch == linearSize)) { - rowPitch = 0; - } - else { - linearSize = rowPitch; - } - - // Calculate the pin size, which should be equal to the copy size - for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) { - linearSize *= size[i]; - if (i == 1) { - if ((slicePitch == 0) || (slicePitch == linearSize)) { - slicePitch = 0; - } - else { - if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) { - linearSize = slicePitch; - } - else { - linearSize = slicePitch * size[i]; - } - } - } - } -} - -// The following data structures will be used for the view creations. 
-// Some formats has to be converted before a kernel blit operation -struct FormatConvertion { - cl_uint clOldType_; - cl_uint clNewType_; -}; - -// The list of rejected data formats and corresponding conversion -static const FormatConvertion RejectedData[] = -{ - { CL_UNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_UNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_SNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_SNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_HALF_FLOAT, CL_UNSIGNED_INT16 }, - { CL_FLOAT, CL_UNSIGNED_INT32 }, - { CL_SIGNED_INT8, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT16, CL_UNSIGNED_INT16 }, - { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT32, CL_UNSIGNED_INT32 } -}; - -// The list of rejected channel's order and corresponding conversion -static const FormatConvertion RejectedOrder[] = -{ - { CL_A, CL_R }, - { CL_RA, CL_RG }, - { CL_LUMINANCE, CL_R }, - { CL_INTENSITY, CL_R }, - { CL_RGB, CL_RGBA }, - { CL_BGRA, CL_RGBA }, - { CL_ARGB, CL_RGBA }, - { CL_sRGB, CL_RGBA }, - { CL_sRGBx, CL_RGBA }, - { CL_sRGBA, CL_RGBA }, - { CL_sBGRA, CL_RGBA }, - { CL_DEPTH, CL_R} -}; - -const uint RejectedFormatDataTotal = - sizeof(RejectedData) / sizeof(FormatConvertion); -const uint RejectedFormatChannelTotal = - sizeof(RejectedOrder) / sizeof(FormatConvertion); - -amd::Image::Format -KernelBlitManager::filterFormat(amd::Image::Format oldFormat) const -{ - cl_image_format newFormat; - newFormat.image_channel_data_type = oldFormat.image_channel_data_type; - newFormat.image_channel_order = oldFormat.image_channel_order; + bool rejected = false; + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(dstMemory).owner()->asImage()->getImageFormat()); // Find unsupported formats for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == oldFormat.image_channel_data_type) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { newFormat.image_channel_data_type = 
RejectedData[i].clNewType_; + rejected = true; break; } } // Find unsupported channel's order for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == oldFormat.image_channel_order) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; break; } } - return amd::Image::Format(newFormat); + // If the image format was rejected, then attempt to create a view + if (rejected && + // todo ROC runtime has a problem with a view for this format + (gpuMem(dstMemory).owner()->asImage()-> + getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) { + dstView = createView(gpuMem(dstMemory), newFormat); + if (dstView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return DmaBlitManager::copyBufferToImage( + srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + + // Use a common blit type with three dimensions by default + uint blitType = BlitCopyBufferToImage; + size_t dim = 0; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + if (dstMemory.owner()->asImage()->getDims() == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } + else if (dstMemory.owner()->asImage()->getDims() == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = 
amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcMemory.owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstView->owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + uint32_t memFmtSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize(); + uint32_t components = dstMemory.owner()->asImage()->getImageFormat().getNumChannels(); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmtSize == 2) { + granularity = 2; + } + else if (memFmtSize >= 4) { + granularity = 4; + } + CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong srcOrg[4] = { srcOrigin[0] / granularity, + srcOrigin[1], + srcOrigin[2], 0 }; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + cl_int dstOrg[4] = { (cl_int)dstOrigin[0], + (cl_int)dstOrigin[1], + (cl_int)dstOrigin[2], 0 }; + cl_int copySize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; + + setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmtSize / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 
1 : multiplier; + cl_uint format[4] = { components, + memFmtSize / components, + multiplier, 0 }; + setArgument(kernels_[blitType], 5, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = { 0 }; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); + setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().capture(dev()); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL); + kernels_[blitType]->parameters().release(const_cast
(parameters), dev()); + + if (releaseView) { + // todo SRD programming could be changed to avoid a stall + gpu().releaseGpuMemoryFence(); + dstView->owner()->release(); + } + + return result; } -device::Memory * -KernelBlitManager::createImageView( - device::Memory &parent, - amd::Image::Format newFormat) const +bool +KernelBlitManager::copyImageToBuffer( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const { - amd::Image *image = - parent.owner()->asImage()->createView(parent.owner()->getContext(), newFormat, &gpu()); + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + size_t imgRowPitch = size[0] * gpuMem(srcMemory).owner()->asImage()->getImageFormat().getElementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; - if (image == NULL) { - LogError("[OCL] Fail to allocate view of image object"); - return NULL; + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, + size, entire, rowPitch, slicePitch); + synchronize(); + return result; + } + // Check if buffer is in system memory with direct access + else if (gpuMem(dstMemory).isHostMemDirectAccess() && + (((rowPitch == 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && + ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { + // First attempt to do this all with DMA, + // but there are restrictions with older hardware + // If the dest buffer is external physical(SDI), copy two step as + // single step SDMA is causing corruption and the cause is under investigation + if (dev().settings().imageDMA_) { + result = DmaBlitManager::copyImageToBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, + size, entire, rowPitch, slicePitch); + if (result) { + 
synchronize(); + return result; + } + } } - Image* devImage = new roc::Image(static_cast(dev_), *image); - if (devImage == NULL) { - LogError("[OCL] Fail to allocate device mem object for the view"); - image->release(); - return NULL; + if (!result) { + result = copyImageToBufferKernel(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); } - if (!devImage->createView(static_cast(parent))) { - LogError("[OCL] Fail to create device mem object for the view"); - delete devImage; - image->release(); - return NULL; + synchronize(); + + return result; +} + +bool +KernelBlitManager::copyImageToBufferKernel( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(srcMemory).owner()->asImage()->getImageFormat()); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } } - image->replaceDeviceMemory(&dev_, devImage); + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } - return devImage; + // If the image format was rejected, then attempt to create a view + if (rejected && + // todo ROC runtime has a problem with a view for this format + (gpuMem(srcMemory).owner()->asImage()-> + getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) { + srcView = createView(gpuMem(srcMemory), newFormat); + if (srcView != NULL) { + 
rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return DmaBlitManager::copyImageToBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + + uint blitType = BlitCopyImageToBuffer; + size_t dim = 0; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if (srcMemory.owner()->asImage()->getDims() == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } + else if (srcMemory.owner()->asImage()->getDims() == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcView->owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstMemory.owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Update extra parameters for USHORT and UBYTE pointers. 
+ // Only then compiler can optimize the kernel to use + // UAV Raw for other writes + setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); + + cl_int srcOrg[4] = { (cl_int)srcOrigin[0], + (cl_int)srcOrigin[1], + (cl_int)srcOrigin[2], 0 }; + cl_int copySize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; + setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); + uint32_t memFmtSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize(); + uint32_t components = srcMemory.owner()->asImage()->getImageFormat().getNumChannels(); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmtSize == 2) { + granularity = 2; + } + else if (memFmtSize >= 4) { + granularity = 4; + } + CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong dstOrg[4] = { dstOrigin[0] / granularity, + dstOrigin[1], + dstOrigin[2], 0 }; + setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmtSize / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 1 : multiplier; + cl_uint format[4] = { components, + memFmtSize / components, + multiplier, 0 }; + setArgument(kernels_[blitType], 7, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = { 0 }; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); + setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().capture(dev()); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL); + kernels_[blitType]->parameters().release(const_cast
(parameters), dev()); + if (releaseView) { + // todo SRD programming could be changed to avoid a stall + gpu().releaseGpuMemoryFence(); + srcView->owner()->release(); + } + + return result; +} + +bool +KernelBlitManager::copyImage( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(srcMemory).owner()->asImage()->getImageFormat()); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Search for the rejected channel's order only if the format was rejected + // Note: Image blit is independent from the channel order + if (rejected) { + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + } + + // Attempt to create a view if the format was rejected + if (rejected) { + srcView = createView(gpuMem(srcMemory), newFormat); + if (srcView != NULL) { + dstView = createView(gpuMem(dstMemory), newFormat); + if (dstView != NULL) { + rejected = false; + releaseView = true; + } + else { + delete srcView; + } + } + } + + // Fall into the host path for the entire 2D copy or + // if the image format was rejected + if (rejected) { + result = HostBlitManager::copyImage(srcMemory, dstMemory, + srcOrigin, dstOrigin, size, entire); + synchronize(); + return result; + } + + uint blitType = BlitCopyImage; + size_t dim = 0; + size_t globalWorkOffset[3] = { 0, 0, 
0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if ((srcMemory.owner()->asImage()->getDims() == 1) || + (dstMemory.owner()->asImage()->getDims() == 1)) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } + else if ((srcMemory.owner()->asImage()->getDims() == 2) || + (dstMemory.owner()->asImage()->getDims() == 2)) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // The current OpenCL spec allows "copy images from a 1D image + // array object to a 1D image array object" only. 
+ if ((gpuMem(srcMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) || + (gpuMem(dstMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { + blitType = BlitCopyImage1DA; + } + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcView->owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstView->owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Program source origin + cl_int srcOrg[4] = { (cl_int)srcOrigin[0], + (cl_int)srcOrigin[1], + (cl_int)srcOrigin[2], 0 }; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + // Program destination origin + cl_int dstOrg[4] = { (cl_int)dstOrigin[0], + (cl_int)dstOrigin[1], + (cl_int)dstOrigin[2], 0 }; + setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + + cl_int copySize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().capture(dev()); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL); + kernels_[blitType]->parameters().release(const_cast
(parameters), dev()); + if (releaseView) { + // todo SRD programming could be changed to avoid a stall + gpu().releaseGpuMemoryFence(); + srcView->owner()->release(); + dstView->owner()->release(); + } + + synchronize(); + + return result; +} + +void +FindPinSize( + size_t& pinSize, const amd::Coord3D& size, + size_t& rowPitch, size_t& slicePitch, const Memory& mem) +{ + pinSize = size[0] * mem.owner()->asImage()->getImageFormat().getElementSize(); + if ((rowPitch == 0) || (rowPitch == pinSize)) { + rowPitch = 0; + } + else { + pinSize = rowPitch; + } + + // Calculate the pin size, which should be equal to the copy size + for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) { + pinSize *= size[i]; + if (i == 1) { + if ((slicePitch == 0) || (slicePitch == pinSize)) { + slicePitch = 0; + } + else { + if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) { + pinSize = slicePitch; + } + else { + pinSize = slicePitch * size[i]; + } + } + } + } } bool KernelBlitManager::readImage( - device::Memory& srcMemory, + device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, @@ -810,98 +1559,47 @@ KernelBlitManager::readImage( size_t slicePitch, bool entire) const { - return HsaBlitManager::readImage( - srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); -} + amd::ScopedLock k(lockXferOps_); + bool result = false; -bool -KernelBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess()) { - //return device::HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, - // entire); - // } + // Use host copy if memory has direct access + if (setup_.disableReadImage_ || + (gpuMem(srcMemory).isHostMemDirectAccess())) { + result = HostBlitManager::readImage(srcMemory, dstHost, + origin, size, rowPitch, slicePitch, entire); + synchronize(); + return 
result; + } + else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory)); - // Exercise HSA path for now. - return HsaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, - entire); + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - amd::Buffer *srcMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, size[0]); + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readImage(srcMemory, dstHost, + origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } - if (!srcMemory->create(const_cast(srcHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; + // Readjust destination offset + const amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getRocMemory(amdMemory); + + // Copy image to buffer + result = copyImageToBuffer(srcMemory, *dstMemory, + origin, dstOrigin, size, entire, rowPitch, slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); } - device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_); - if (devSrcMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = - copyBuffer(*devSrcMemory, dstMemory, amd::Coord3D(0), origin, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // source memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. 
- gpu().releaseGpuMemoryFence(); - - srcMemory->release(); - - return result; -} - -bool -KernelBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - // if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) { - //return device::HostBlitManager::writeBufferRect( - // srcHost, dstMemory, hostRect, bufRect, size, entire); - // } - - // Exercise HSA path for now. - return HsaBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - - size_t srcSize = hostRect.start_ + hostRect.end_; - amd::Buffer *srcMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, srcSize); - - if (!srcMemory->create(const_cast(srcHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_); - if (devSrcMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBufferRect( - *devSrcMemory, dstMemory, hostRect, bufRect, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. 
- gpu().releaseGpuMemoryFence(); - - srcMemory->release(); + synchronize(); return result; } @@ -909,132 +1607,81 @@ KernelBlitManager::writeBufferRect( bool KernelBlitManager::writeImage( const void* srcHost, - device::Memory& dstMemory, + device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { - return HsaBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); -} + amd::ScopedLock k(lockXferOps_); + bool result = false; -bool -KernelBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, - bool entire) const -{ - // if (setup_.disableCopyBuffer_ || - // (srcMemory.isHostMemDirectAccess() && - // dstMemory.isHostMemDirectAccess())) { - //return HsaBlitManager::copyBuffer( - // srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); - // } + // Use host copy if memory has direct access + if (setup_.disableWriteImage_|| gpuMem(dstMemory).isHostMemDirectAccess()) { + result = HostBlitManager::writeImage( + srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } + else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); - // Exercise HSA path for now. 
- return HsaBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - uint blitType = BlitCopyBuffer; - size_t dim = 1; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize = 0; - size_t localWorkSize = 0; - - const static uint CopyBuffAlignment[3] = { 16, 4, 1 }; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - bool aligned; - uint i; - for (i = 0; i < 3; ++i) { - // Check source alignments - aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check destination alignments - aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); - - if (aligned) { - if (CopyBuffAlignment[i] != 1) { - blitType = BlitCopyBufferAligned; - } - break; + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::writeImage( + srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; } + + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getRocMemory(amdMemory); + + // Copy image to buffer + result = copyBufferToImage(*srcMemory, dstMemory, + srcOrigin, origin, size, entire, rowPitch, slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); } - cl_uint remain; - if (blitType == BlitCopyBufferAligned) { - size.c[0] /= CopyBuffAlignment[i]; - } - else { - remain = size[0] % 4; - size.c[0] /= 4; - size.c[0] += 1; - } + synchronize(); - // Program the dispatch dimensions - localWorkSize = 256; - globalWorkSize = amd::alignUp(size[0] , 256); - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) 
as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - // Program source origin - cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(2, sizeof(srcOffset), &srcOffset); - - // Program destination origin - cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(3, sizeof(dstOffset), &dstOffset); - - cl_ulong copySize = size[0]; - kernels_[blitType]->parameters().set(4, sizeof(copySize), &copySize); - - if (blitType == BlitCopyBufferAligned) { - cl_int alignment = CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(5, sizeof(alignment), &alignment); - } - else { - kernels_[blitType]->parameters().set(5, sizeof(remain), &remain); - } - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange( - 1, globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast<address>
(parameters), dev_); return result; } bool KernelBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRectIn, - const amd::BufferRect& dstRectIn, + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::BufferRect& srcRectIn, + const amd::BufferRect& dstRectIn, const amd::Coord3D& sizeIn, bool entire) const { - // if (setup_.disableCopyBuffer_ || - // (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess())) { - //return HsaBlitManager::copyBufferRect( - // srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); - // } + amd::ScopedLock k(lockXferOps_); + bool result = false; + bool rejected = false; - // Exercise HSA path for now. - return HsaBlitManager::copyBufferRect( - srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); + // Fall into the ROC path for rejected transfers + if (setup_.disableCopyBufferRect_ || + gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) { + result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory, + srcRectIn, dstRectIn, sizeIn, entire); + + if (result) { + synchronize(); + return result; + } + } uint blitType = BlitCopyBufferRect; size_t dim = 3; @@ -1110,300 +1757,269 @@ KernelBlitManager::copyBufferRect( // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - cl_ulong src[4] = {srcRect.rowPitch_, - srcRect.slicePitch_, - srcRect.start_, 0 }; - kernels_[blitType]->parameters().set(2, sizeof(src), src); - cl_ulong dst[4] = {dstRect.rowPitch_, - dstRect.slicePitch_, - dstRect.start_, 0 }; - kernels_[blitType]->parameters().set(3, sizeof(dst), dst); - cl_ulong copySize[4] = {size[0], - size[1], - size[2], - CopyRectAlignment[i] }; - 
kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize); + cl_mem mem = as_cl(srcMemory.owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstMemory.owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + cl_ulong src[4] = { srcRect.rowPitch_, + srcRect.slicePitch_, + srcRect.start_, 0 }; + setArgument(kernels_[blitType], 2, sizeof(src), src); + cl_ulong dst[4] = { dstRect.rowPitch_, + dstRect.slicePitch_, + dstRect.start_, 0 }; + setArgument(kernels_[blitType], 3, sizeof(dst), dst); + cl_ulong copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] }; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); // Create ND range object for the kernel's execution amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), dev_); + address parameters = kernels_[blitType]->parameters().capture(dev()); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL); + kernels_[blitType]->parameters().release(const_cast
(parameters), dev()); + + synchronize(); + return result; } bool -KernelBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - if (dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, - dstOrigin, size, entire, rowPitch, - slicePitch); - } - - amd::Image::Format oldFormat = srcMemory.owner()->asImage()->getImageFormat(); - amd::Image::Format newFormat = filterFormat(oldFormat); - bool useView = false; - - device::Memory* srcView = &srcMemory; - if (oldFormat != newFormat) { - srcView = createImageView(srcMemory, newFormat); - useView = true; - } - - roc::Image& srcImage = static_cast(*srcView); - - amd::Image* image = srcImage.owner()->asImage(); - uint blitType = 0; - blitType = BlitCopyImageToBuffer; - size_t dim = 0; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - const size_t imageDims = srcImage.owner()->asImage()->getDims(); - dim = 3; - // Find the current blit type - if (imageDims == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } else if (imageDims == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels 
arguments for the blit operation - cl_mem clmem = ((cl_mem)as_cl(srcImage.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem)as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - - // Update extra paramters for USHORT and UBYTE pointers. - // Only then compiler can optimize the kernel to use - // UAV Raw for other writes - kernels_[blitType]->parameters().set(2, sizeof(cl_mem), &clmem); - kernels_[blitType]->parameters().set(3, sizeof(cl_mem), &clmem); - - cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0}; - cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; - - kernels_[blitType]->parameters().set(4, sizeof(srcOrg), srcOrg); - - const size_t elementSize = - srcImage.owner()->asImage()->getImageFormat().getElementSize(); - const size_t numChannels = - srcImage.owner()->asImage()->getImageFormat().getNumChannels(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (elementSize == 2) { - granularity = 2; - } else if (elementSize >= 4) { - granularity = 4; - } - CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong dstOrg[4] = {dstOrigin[0] / granularity, dstOrigin[1], dstOrigin[2], - 0}; - kernels_[blitType]->parameters().set(5, sizeof(dstOrg), dstOrg); - kernels_[blitType]->parameters().set(6, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = elementSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 
1 : multiplier; - cl_uint format[4] = {(cl_uint)numChannels, - (cl_uint)(elementSize / numChannels), multiplier, 0}; - kernels_[blitType]->parameters().set(7, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = {0}; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, srcImage); - kernels_[blitType]->parameters().set(8, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, - localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], - parameters, NULL); - - gpu().releaseGpuMemoryFence(); - - kernels_[blitType]->parameters().release(const_cast
(parameters), - dev_); - - if (useView) { - srcView->owner()->release(); - } - - return result; -} - -bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, - size_t slicePitch) const { - if (srcMemory.isHostMemDirectAccess()) { - return HsaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, - dstOrigin, size, entire, rowPitch, - slicePitch); - } - - amd::Image::Format oldFormat = dstMemory.owner()->asImage()->getImageFormat(); - amd::Image::Format newFormat = filterFormat(oldFormat); - bool useView = false; - - device::Memory* dstView = &dstMemory; - if (oldFormat != newFormat) { - dstView = createImageView(dstMemory, newFormat); - useView = true; - } - - roc::Image& dstImage = static_cast(*dstView); - - // Use a common blit type with three dimensions by default - uint blitType = BlitCopyBufferToImage; - size_t dim = 0; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - const size_t imageDims = dstImage.owner()->asImage()->getDims(); - dim = 3; - if (imageDims == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } else if (imageDims == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels 
arguments for the blit operation - cl_mem clmem = ((cl_mem)as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem)as_cl(dstImage.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - - const size_t elementSize = - dstImage.owner()->asImage()->getImageFormat().getElementSize(); - const size_t numChannels = - dstImage.owner()->asImage()->getImageFormat().getNumChannels(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (elementSize == 2) { - granularity = 2; - } else if (elementSize >= 4) { - granularity = 4; - } - CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong srcOrg[4] = {srcOrigin[0] / granularity, srcOrigin[1], srcOrigin[2], - 0}; - kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg); - - cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0}; - cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; - - kernels_[blitType]->parameters().set(3, sizeof(dstOrg), dstOrg); - kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = elementSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 
1 : multiplier; - cl_uint format[4] = {(cl_uint)numChannels, - (cl_uint)(elementSize / numChannels), multiplier, 0}; - kernels_[blitType]->parameters().set(5, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = {0}; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, dstImage); - kernels_[blitType]->parameters().set(6, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, - localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], - parameters, NULL); - - gpu().releaseGpuMemoryFence(); - - kernels_[blitType]->parameters().release(const_cast
(parameters), - dev_); - - if (useView) { - dstView->owner()->release(); - } - - return result; -} - -bool -KernelBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, +KernelBlitManager::readBuffer( + device::Memory& srcMemory, + void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { - return HsaBlitManager::copyImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + amd::ScopedLock k(lockXferOps_); + bool result = false; + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess())) { + result = HostBlitManager::readBuffer( + srcMemory, dstHost, origin, size, entire); + synchronize(); + return result; + } + else { + size_t pinSize = size[0]; + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && + (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readBuffer( + srcMemory, dstHost, origin, size, entire); + synchronize(); + return result; + } + + // Readjust host mem offset + amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getRocMemory(amdMemory); + + // Copy image to buffer + result = copyBuffer(srcMemory, *dstMemory, + origin, dstOrigin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + else { + result = DmaBlitManager::readBuffer( + srcMemory, dstHost, origin, size, entire); + } + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::readBufferRect( + device::Memory& srcMemory, + void* dstHost, + const amd::BufferRect& bufRect, + const amd::BufferRect& hostRect, + const amd::Coord3D& size, + bool entire) const +{ + 
amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) { + result = HostBlitManager::readBufferRect( + srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; + } + else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readBufferRect( + srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; + } + + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; + + // Get device memory for this virtual device + Memory* dstMemory = dev().getRocMemory(amdMemory); + + // Copy image to buffer + result = copyBufferRect(srcMemory, *dstMemory, + bufRect, rect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::writeBuffer( + const void* srcHost, + device::Memory& dstMemory, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess()) { + result = HostBlitManager::writeBuffer( + srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } + else { + size_t pinSize = size[0]; + + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && + (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory 
== NULL) { + // Force SW copy + result = HostBlitManager::writeBuffer( + srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } + + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getRocMemory(amdMemory); + + // Copy buffer rect + result = copyBuffer(*srcMemory, dstMemory, + srcOrigin, origin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + else { + result = DmaBlitManager::writeBuffer( + srcHost, dstMemory, origin, size, entire); + } + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::writeBufferRect( + const void* srcHost, + device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, + const amd::Coord3D& size, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableWriteBufferRect_ || + gpuMem(dstMemory).isHostMemDirectAccess()) { + result = HostBlitManager::writeBufferRect( + srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; + } + else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force DMA copy with staging + result = DmaBlitManager::writeBufferRect( + srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; + } + + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getRocMemory(amdMemory); + + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; + + // Copy buffer rect + result = 
copyBufferRect(*srcMemory, dstMemory, + rect, bufRect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; } bool KernelBlitManager::fillBuffer( - device::Memory& memory, + device::Memory& memory, const void* pattern, size_t patternSize, const amd::Coord3D& origin, @@ -1411,72 +2027,164 @@ KernelBlitManager::fillBuffer( bool entire ) const { - if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) { - return HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, - size, entire); - } + amd::ScopedLock k(lockXferOps_); + bool result = false; - uint fillType = FillBuffer; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - cl_ulong fillSize = size[0] / patternSize; - size_t globalWorkSize = amd::alignUp(fillSize, 256); - size_t localWorkSize = 256; - bool dwordAligned = - ((patternSize % sizeof(uint32_t)) == 0) ? true : false; - - // Program kernels arguments for the fill operation - if (dwordAligned) { - kernels_[fillType]->parameters().set(0, sizeof(cl_mem), NULL); - cl_mem clmem = ((cl_mem) as_cl(memory.owner())); - kernels_[fillType]->parameters().set(1, sizeof(cl_mem), &clmem); + // Use host fill if memory has direct access + if (setup_.disableFillBuffer_ || + gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillBuffer( + memory, pattern, patternSize, origin, size, entire); + synchronize(); + return result; } else { - cl_mem clmem = ((cl_mem) as_cl(memory.owner())); - kernels_[fillType]->parameters().set(0, sizeof(cl_mem), &clmem); - kernels_[fillType]->parameters().set(1, sizeof(cl_mem), NULL); + uint fillType = FillBuffer; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + cl_ulong fillSize = size[0] / patternSize; + size_t globalWorkSize = amd::alignUp(fillSize, 256); + size_t localWorkSize = 256; + bool dwordAligned = + ((patternSize % sizeof(uint32_t)) == 0) ? 
true : false; + + // Program kernels arguments for the fill operation + cl_mem mem = as_cl(memory.owner()); + if (dwordAligned) { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); + } + else { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL); + } + Memory* gpuCB = dev().getRocMemory(constantBuffer_); + if (gpuCB == NULL) { + return false; + } + void* constBuf = gpuCB->getDeviceMemory(); + memcpy(constBuf, pattern, patternSize); + + mem = as_cl(gpuCB->owner()); + setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem); + cl_ulong offset = origin[0]; + if (dwordAligned) { + patternSize /= sizeof(uint32_t); + offset /= sizeof(uint32_t); + } + setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize); + setArgument(kernels_[fillType], 4, sizeof(offset), &offset); + setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(1, + globalWorkOffset, &globalWorkSize, &localWorkSize); + + // Execute the blit + address parameters = kernels_[fillType]->parameters().capture(dev()); + result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, NULL); + kernels_[fillType]->parameters().release(const_cast
(parameters), dev()); } - amd::Buffer *fillMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, patternSize); + synchronize(); - if (!fillMemory->create(const_cast(pattern))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; + return result; +} + +bool +KernelBlitManager::copyBuffer( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& sizeIn, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + if (!gpuMem(srcMemory).isHostMemDirectAccess() && + !gpuMem(dstMemory).isHostMemDirectAccess()) { + uint blitType = BlitCopyBuffer; + size_t dim = 1; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize = 0; + size_t localWorkSize = 0; + + // todo LC shows much better performance with the unaligned version + const static uint CopyBuffAlignment[3] = { 1/*16*/, 1/*4*/, 1 }; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); + + bool aligned = false; + uint i; + for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check destination alignments + aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); + + if (aligned) { + if (CopyBuffAlignment[i] != 1) { + blitType = BlitCopyBufferAligned; + } + break; + } + } + + cl_uint remain; + if (blitType == BlitCopyBufferAligned) { + size.c[0] /= CopyBuffAlignment[i]; + } + else { + remain = size[0] % 4; + size.c[0] /= 4; + size.c[0] += 1; + } + + // Program the dispatch dimensions + localWorkSize = 256; + globalWorkSize = amd::alignUp(size[0] , 256); + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcMemory.owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstMemory.owner()); 
+ setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + // Program source origin + cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];; + setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); + + // Program destination origin + cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];; + setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); + + cl_ulong copySize = size[0]; + setArgument(kernels_[blitType], 4, sizeof(copySize), &copySize); + + if (blitType == BlitCopyBufferAligned) { + cl_int alignment = CopyBuffAlignment[i]; + setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment); + } + else { + setArgument(kernels_[blitType], 5, sizeof(remain), &remain); + } + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(1, + globalWorkOffset, &globalWorkSize, &localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().capture(dev()); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL); + kernels_[blitType]->parameters().release(const_cast<address>
(parameters), dev()); + } + else { + result = DmaBlitManager::copyBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); } - if (fillMemory->getDeviceMemory(dev_) == NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - cl_mem clmem = ((cl_mem) as_cl(fillMemory)); - kernels_[fillType]->parameters().set(2, sizeof(cl_mem), &clmem); - cl_ulong offset = origin[0]; - if (dwordAligned) { - patternSize /= sizeof(uint32_t); - offset /= sizeof(uint32_t); - } - kernels_[fillType]->parameters().set(3, sizeof(cl_uint), &patternSize); - kernels_[fillType]->parameters().set(4, sizeof(offset), &offset); - kernels_[fillType]->parameters().set(5, sizeof(fillSize), &fillSize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[fillType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[fillType], parameters, NULL); - kernels_[fillType]->parameters().release(const_cast
(parameters), dev_); - - // Wait for the transfer to finish so that we could safely release the - // fill memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - fillMemory->release(); + synchronize(); return result; } @@ -1490,48 +2198,249 @@ KernelBlitManager::fillImage( bool entire ) const { - return HsaBlitManager::fillImage(memory, pattern, origin, size, entire); -} + amd::ScopedLock k(lockXferOps_); + bool result = false; -bool -KernelBlitManager::create(amd::Device& device) -{ - if (!HsaBlitManager::create(device)) { - return false; - } - if (!createProgram(static_cast(device))) { - return false; + // Use host fill if memory has direct access + if (setup_.disableFillImage_ || + gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillImage( + memory, pattern, origin, size, entire); + synchronize(); + return result; } - return true; -} + uint fillType; + size_t dim = 0; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + Memory* memView = &gpuMem(memory); + amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat()); -bool -KernelBlitManager::createProgram(Device& device) -{ - // Save context and program for this device - context_ = device.blitProgram()->context_; - context_->retain(); - program_ = device.blitProgram()->program_; - program_->retain(); + // Program the kernels workload depending on the fill dimensions + fillType = FillImage; + dim = 3; - bool result = true; + void *newpattern = const_cast(pattern); + cl_uint4 iFillColor; - // Create kernel objects for all blits - for (uint i = 0; i < BlitTotal; ++i) { - const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); - if (symbol == NULL) { - result = false; - continue; + bool rejected = false; + bool releaseView = false; + + // For depth, we need to create a view + if 
(newFormat.image_channel_order == CL_sRGBA) { + // Find unsupported data type + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } } - kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); - if (kernels_[i] == NULL) { - result = false; - continue; + + if (newFormat.image_channel_order == CL_sRGBA) { + // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value because hw is not support write_imagef for sRGB. + float *fColor = static_cast(newpattern); + iFillColor.s[0] = sRGBmap(fColor[0]); + iFillColor.s[1] = sRGBmap(fColor[1]); + iFillColor.s[2] = sRGBmap(fColor[2]); + iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f); + newpattern = static_cast(&iFillColor); + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } } } + // If the image format was rejected, then attempt to create a view + if (rejected) { + memView = createView(gpuMem(memory), newFormat); + if (memView != NULL) { + rejected = false; + releaseView = true; + } + } + + if (rejected) { + return DmaBlitManager::fillImage(memory, pattern, origin, size, entire); + } + + // Perform workload split to allow multiple operations in a single thread + globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize; + // Find the current blit type + if (memView->owner()->asImage()->getDims() == 1) { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } + else if (memView->owner()->asImage()->getDims()== 2) { + globalWorkSize[0] = 
amd::alignUp(globalWorkSize[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(memView->owner()); + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern); + setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern); + setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern); + + cl_int fillOrigin[4] = { (cl_int)origin[0], + (cl_int)origin[1], + (cl_int)origin[2], 0 }; + cl_int fillSize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; + setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); + setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); + + // Find the type of image + uint32_t type = 0; + switch (newFormat.image_channel_data_type) { + case CL_SNORM_INT8: + case CL_SNORM_INT16: + case CL_UNORM_INT8: + case CL_UNORM_INT16: + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + case CL_UNORM_INT_101010: + case CL_HALF_FLOAT: + case CL_FLOAT: + type = 0; + break; + case CL_SIGNED_INT8: + case CL_SIGNED_INT16: + case CL_SIGNED_INT32: + type = 1; + break; + case CL_UNSIGNED_INT8: + case CL_UNSIGNED_INT16: + case CL_UNSIGNED_INT32: + type = 2; + break; + } + setArgument(kernels_[fillType], 6, sizeof(type), &type); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[fillType]->parameters().capture(dev()); + result = gpu().submitKernelInternal(ndrange, 
*kernels_[fillType], parameters, NULL); + kernels_[fillType]->parameters().release(const_cast<address>
(parameters), dev()); + if (releaseView) { + // todo SRD programming could be changed to avoid a stall + gpu().releaseGpuMemoryFence(); + memView->owner()->release(); + } + + synchronize(); return result; } -} // namespace roc +amd::Memory* +DmaBlitManager::pinHostMemory( + const void* hostMem, + size_t pinSize, + size_t& partial) const +{ + size_t pinAllocSize; + const static bool SysMem = true; + amd::Memory* amdMemory; + + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(hostMem), + PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + partial = reinterpret_cast(hostMem) - tmpHost; + + // Recalculate pin memory size + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + + amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize); + + if (NULL != amdMemory) { + return amdMemory; + } + + amdMemory = new(*context_) + amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize); + + if ((amdMemory != NULL) && !amdMemory->create(tmpHost, SysMem)) { + amdMemory->release(); + return NULL; + } + + // Get device memory for this virtual device + // @note: This will force real memory pinning + amdMemory->setVirtualDevice(&gpu()); + Memory* srcMemory = dev().getRocMemory(amdMemory); + + if (srcMemory == NULL) { + // Release all pinned memory and attempt pinning again + gpu().releasePinnedMem(); + srcMemory = dev().getRocMemory(amdMemory); + if (srcMemory == NULL) { + // Release memory + amdMemory->release(); + amdMemory = NULL; + } + } + + return amdMemory; +} + +Memory* +KernelBlitManager::createView( + const Memory& parent, + const cl_image_format format) const +{ + assert((parent.owner()->asBuffer() == nullptr) && "View supports images only"); + amd::Image *image = + parent.owner()->asImage()->createView(parent.owner()->getContext(), format, &gpu()); + + if (image == NULL) { + LogError("[OCL] Fail to allocate view of image object"); + return NULL; + } + + Image* 
devImage = new roc::Image(dev(), *image); + if (devImage == NULL) { + LogError("[OCL] Fail to allocate device mem object for the view"); + image->release(); + return NULL; + } + + if (!devImage->createView(parent)) { + LogError("[OCL] Fail to create device mem object for the view"); + delete devImage; + image->release(); + return NULL; + } + + image->replaceDeviceMemory(&dev_, devImage); + + return devImage; +} + +} // namespace roc diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp index 16d1ef2363..8891f7170c 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. // #pragma once @@ -8,12 +8,13 @@ #include "platform/commandqueue.hpp" #include "device/device.hpp" #include "device/blit.hpp" +#include "device/rocm/rocdefs.hpp" -/*! \addtogroup HSA Blit Implementation +/*! \addtogroup ROC Blit Implementation * @{ */ -//! HSA Blit Manager Implementation +//! ROC Blit Manager Implementation namespace roc { class Device; @@ -22,221 +23,29 @@ class Memory; class VirtualGPU; //! DMA Blit Manager -class HsaBlitManager : public device::HostBlitManager +class DmaBlitManager : public device::HostBlitManager { public: - //! Constructor - HsaBlitManager( - device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); - - //! Destructor - virtual ~HsaBlitManager() { - if (completion_signal_.handle != 0) { - hsa_signal_destroy(completion_signal_); - } - } - - //! Creates HostBlitManager object - virtual bool create(amd::Device& device) { - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) { - return false; - } - return true; - } - - //! 
Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! 
Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! 
Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! 
Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - -protected: - //! Returns the virtual GPU object - VirtualGPU& gpu() const { return static_cast(vDev_); } - -private: - //! Handle of Hsa Device object - const roc::Device& roc_device_; - - hsa_signal_t completion_signal_; - - //! Assits in transferring data from Host to Local or vice versa - //! taking into account the Hsail profile supported by Hsa Agent - bool hsaCopy( - const void *hostSrc, //!< Contains source data to be copied - void *hostDst, //!< Destination buffer address for copying - uint32_t size, //!< Size of data to copy in bytes - bool hostToDev //!< True if data is copied from Host To Device - ) const; - - //! Disable copy constructor - HsaBlitManager(const HsaBlitManager&); - - //! Disable operator= - HsaBlitManager& operator=(const HsaBlitManager&); -}; - -//! Kernel Blit Manager -//class KernelBlitManager : public HsaBlitManager -class KernelBlitManager : public HsaBlitManager -{ -private: - VirtualGPU& gpu() const { return static_cast(vDev_); } -public: - enum { - BlitCopyImage = 0, - BlitCopyImage1DA, - BlitCopyImageToBuffer, - BlitCopyBufferToImage, - BlitCopyBufferRect, - BlitCopyBufferRectAligned, - BlitCopyBuffer, - BlitCopyBufferAligned, - FillBuffer, - FillImage, - BlitTotal - }; - //! Constructor - KernelBlitManager( - device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits + DmaBlitManager( + VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits ); //! 
Destructor - virtual ~KernelBlitManager(); + virtual ~DmaBlitManager() { + if (completion_signal_.handle != 0) { + hsa_signal_destroy(completion_signal_); + } + } - //! Creates HostBlitManager object - virtual bool create(amd::Device& device); + //! Creates DmaBlitManager object + virtual bool create(amd::Device& device) { + if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) { + false; + } + return true; + } //! Copies a buffer object to system memory virtual bool readBuffer( @@ -352,6 +161,230 @@ public: bool entire = false //!< Entire buffer will be updated ) const; +protected: + const static uint MaxPinnedBuffers = 4; + + //! Synchronizes the blit operations if necessary + inline void synchronize() const; + + //! Returns the virtual GPU object + VirtualGPU& gpu() const { return static_cast(vDev_); } + + //! Returns the ROC device object + const Device& dev() const { return static_cast(dev_); }; + + inline Memory& gpuMem(device::Memory& mem) const; + + //! Pins host memory for GPU access + amd::Memory* pinHostMemory( + const void* hostMem, //!< Host memory pointer + size_t pinSize, //!< Host memory size + size_t& partial //!< Extra offset for memory alignment + ) const; + + //! Assits in transferring data from Host to Local or vice versa + //! taking into account the Hsail profile supported by Hsa Agent + bool hsaCopy( + const Memory& srcMemory, + const Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool enableCopyRect = false, + bool flushDMA = true) const; + + const size_t MinSizeForPinnedTransfer; + bool completeOperation_; //!< DMA blit manager must complete operation + amd::Context* context_; //!< A dummy context + +private: + + //! Disable copy constructor + DmaBlitManager(const DmaBlitManager&); + + //! Disable operator= + DmaBlitManager& operator=(const DmaBlitManager&); + + //! 
Reads video memory, using a staged buffer + bool readMemoryStaged( + Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + Memory& xferBuf, //!< Staged buffer for read + size_t origin, //!< Original offset in the source memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for copy region + size_t xferSize //!< Transfer size + ) const; + + //! Write into video memory, using a staged buffer + bool writeMemoryStaged( + const void* srcHost, //!< Source host memory + Memory& dstMemory, //!< Destination memory object + Memory& xferBuf, //!< Staged buffer for write + size_t origin, //!< Original offset in the destination memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for the copy region + size_t xferSize //!< Transfer size + ) const; + + //! Handle of ROC Device object + hsa_signal_t completion_signal_; + + //! Assits in transferring data from Host to Local or vice versa + //! taking into account the Hsail profile supported by Hsa Agent + bool hsaCopyStaged( + const_address hostSrc, //!< Contains source data to be copied + address hostDst, //!< Destination buffer address for copying + size_t size, //!< Size of data to copy in bytes + address staging, //!< Staging resource + bool hostToDev //!< True if data is copied from Host To Device + ) const; +}; + +//! Kernel Blit Manager +class KernelBlitManager : public DmaBlitManager +{ +public: + enum { + BlitCopyImage = 0, + BlitCopyImage1DA, + BlitCopyImageToBuffer, + BlitCopyBufferToImage, + BlitCopyBufferRect, + BlitCopyBufferRectAligned, + BlitCopyBuffer, + BlitCopyBufferAligned, + FillBuffer, + FillImage, + BlitTotal + }; + + //! Constructor + KernelBlitManager( + VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); + + //! Destructor + virtual ~KernelBlitManager(); + + //! 
Creates DmaBlitManager object + virtual bool create(amd::Device& device); + + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRectIn, //!< Source rectangle + const amd::BufferRect& dstRectIn, //!< Destination rectangle + const amd::Coord3D& sizeIn, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to system memory + virtual bool readBuffer( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to system memory + virtual bool readBufferRect( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies system memory to a buffer object + virtual bool writeBuffer( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! 
Copies system memory to a buffer object + virtual bool writeBufferRect( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to an image object + virtual bool copyBuffer( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to an image object + virtual bool copyBufferToImage( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! Copies an image object to a buffer object + virtual bool copyImageToBuffer( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! 
Copies an image object to another image object + virtual bool copyImage( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies an image object to system memory + virtual bool readImage( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies system memory to an image object + virtual bool writeImage( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; + //! Fills a buffer memory with a pattern data virtual bool fillBuffer( device::Memory& memory, //!< Memory object to fill with pattern @@ -372,26 +405,56 @@ public: ) const; private: - //! Disable copy constructor - KernelBlitManager(const KernelBlitManager&); + static const size_t MaxXferBuffers = 2; + static const uint TransferSplitSize = 1; - //! Disable operator= - KernelBlitManager& operator=(const KernelBlitManager&); + //! 
Copies a buffer object to an image object + bool copyBufferToImageKernel( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! Copies an image object to a buffer object + bool copyImageToBufferKernel( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; //! Creates a program for all blit operations bool createProgram( Device& device //!< Device object ); - amd::Image::Format filterFormat(amd::Image::Format oldFormat) const; + //! Creates a view memory object + Memory* createView( + const Memory& parent, //!< Parent memory object + const cl_image_format format //!< The new format for a view + ) const; - device::Memory *createImageView( - device::Memory &parent, - amd::Image::Format newFormat) const; + //! Disable copy constructor + KernelBlitManager(const KernelBlitManager&); - amd::Context *context_; //!< A dummy context - amd::Program *program_; //!< GPU program obejct - amd::Kernel *kernels_[BlitTotal]; //!< GPU kernels for blit + //! 
Disable operator= + KernelBlitManager& operator=(const KernelBlitManager&); + + amd::Program* program_; //!< GPU program obejct + amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit + amd::Memory* constantBuffer_; //!< An internal CB for blits + amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images + size_t xferBufferSize_; //!< Transfer buffer size + amd::Monitor* lockXferOps_; //!< Lock transfer operation }; static const char* BlitName[KernelBlitManager::BlitTotal] = { @@ -404,9 +467,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = { "copyBuffer", "copyBufferAligned", "fillBuffer", - "fillImage" + "fillImage", }; -/*@}*/ -} // namespace roc +/*@}*/} // namespace roc diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp index b08349f1e7..26fb001b20 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp @@ -4,6 +4,9 @@ namespace roc { +//! 
Alignment restriction for the pinned memory +const static size_t PinnedMemoryAlignment = 4 * Ki; + typedef uint HsaDeviceId; struct AMDDeviceInfo { diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp index c2a17180d4..c784bf1de0 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp @@ -164,9 +164,7 @@ bool NullDevice::create(const AMDDeviceInfo& deviceInfo) { settings_ = new Settings(); roc::Settings* hsaSettings = static_cast(settings_); - if ((hsaSettings == NULL) || - // @Todo sramalin Use double precision from constsant - !hsaSettings->create((true) & 0x1)) { + if ((hsaSettings == NULL) || !hsaSettings->create(false)) { LogError("Error creating settings for NULL HSA device"); return false; } @@ -189,6 +187,8 @@ Device::Device(hsa_agent_t bkendDevice) , alloc_granularity_(0) , context_(nullptr) , xferQueue_(nullptr) + , xferRead_(nullptr) + , xferWrite_(nullptr) , numOfVgpus_(0) { group_segment_.handle = 0; @@ -208,6 +208,10 @@ Device::~Device() delete mapCache_; delete mapCacheOps_; + // Destroy temporary buffers for read/write + delete xferRead_; + delete xferWrite_; + // Destroy transfer queue if (xferQueue_ && xferQueue_->terminate()) { delete xferQueue_; @@ -363,6 +367,85 @@ Device::loaderQueryHostAddress(const void* device, const void** host) : HSA_STATUS_ERROR; } +Device::XferBuffers::~XferBuffers() +{ + // Destroy temporary buffer for reads + for (const auto& buf : freeBuffers_) { + delete buf; + } + freeBuffers_.clear(); +} + +bool +Device::XferBuffers::create() +{ + Memory* xferBuf = nullptr; + bool result = false; + + // Create a buffer object + xferBuf = new Buffer(dev(), bufSize_); + + // Try to allocate memory for the transfer buffer + if ((nullptr == xferBuf) || !xferBuf->create()) { + delete xferBuf; + xferBuf = nullptr; + LogError("Couldn't allocate a transfer buffer!"); + } + else { + result = true; + 
freeBuffers_.push_back(xferBuf); + } + + return result; +} + +Memory& +Device::XferBuffers::acquire() +{ + Memory* xferBuf = nullptr; + size_t listSize; + + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + listSize = freeBuffers_.size(); + + // If the list is empty, then attempt to allocate a staged buffer + if (listSize == 0) { + // Allocate memory + xferBuf = new Buffer(dev(), bufSize_); + + // Allocate memory for the transfer buffer + if ((nullptr == xferBuf) || !xferBuf->create()) { + delete xferBuf; + xferBuf = nullptr; + LogError("Couldn't allocate a transfer buffer!"); + } + else { + ++acquiredCnt_; + } + } + + if (xferBuf == nullptr) { + xferBuf = *(freeBuffers_.begin()); + freeBuffers_.erase(freeBuffers_.begin()); + ++acquiredCnt_; + } + + return *xferBuf; +} + +void +Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) +{ + // Make sure buffer isn't busy on the current VirtualGPU, because + // the next aquire can come from different queue +// buffer.wait(gpu); + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + freeBuffers_.push_back(&buffer); + --acquiredCnt_; +} + bool Device::init() { #if defined(__linux__) @@ -550,6 +633,28 @@ Device::create() // Use just 1 entry by default for the map cache mapCache_->push_back(NULL); + if (settings().stagedXferSize_ != 0) { + // Initialize staged write buffers + if (settings().stagedXferWrite_) { + xferWrite_ = new XferBuffers(*this, + amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferWrite_ == nullptr) || !xferWrite_->create()) { + LogError("Couldn't allocate transfer buffer objects for read"); + return false; + } + } + + // Initialize staged read buffers + if (settings().stagedXferRead_) { + xferRead_ = new XferBuffers(*this, + amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferRead_ == nullptr) || !xferRead_->create()) { + LogError("Couldn't allocate transfer buffer objects for write"); + return false; + } + } + } + 
xferQueue(); return true; @@ -568,11 +673,17 @@ Device::createProgram(amd::option::Options* options) { bool Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev) { + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, + HSA_AGENT_INFO_PROFILE, + &agent_profile_)) { + return false; + } + // Create HSA settings settings_ = new Settings(); roc::Settings* hsaSettings = static_cast(settings_); if ((hsaSettings == NULL) || - !hsaSettings->create((true) & 0x1)) { + !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL))) { return false; } @@ -712,12 +823,6 @@ Device::populateOCLDeviceConstants() ::strcpy(info_.boardName_, device_name); } - if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, - HSA_AGENT_INFO_PROFILE, - &agent_profile_)) { - return false; - } - if (HSA_STATUS_SUCCESS != hsa_agent_get_info( _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, @@ -883,7 +988,7 @@ Device::populateOCLDeviceConstants() if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory, // base-profile = NUMA based non-coherent memory - info_.hostUnifiedMemory_ = CL_TRUE; + info_.hostUnifiedMemory_ = CL_TRUE; } info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? 
sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN); @@ -1244,6 +1349,13 @@ Device::addMapTarget(amd::Memory* memory) const return true; } +Memory* +Device::getRocMemory(amd::Memory* mem) const +{ + return static_cast(mem->getDeviceMemory(*this)); +} + + device::Memory* Device::createMemory(amd::Memory &owner) const { @@ -1302,9 +1414,9 @@ Device::createMemory(amd::Memory &owner) const imageView->replaceDeviceMemory(this, devImageView); result = xferMgr().writeImage(owner.getHostMem(), *devImageView, - amd::Coord3D(0), imageView->getRegion(), - imageView->getRowPitch(), - imageView->getSlicePitch(), true); + amd::Coord3D(0, 0, 0), imageView->getRegion(), + 0, + 0, true); imageView->release(); } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp index cd6b4505dd..d6c5315eeb 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp @@ -219,6 +219,54 @@ private: //! A HSA device ordinal (physical HSA device) class Device : public NullDevice { public: + //! Transfer buffers + class XferBuffers : public amd::HeapObject + { + public: + static const size_t MaxXferBufListSize = 8; + + //! Constructor: stores the owning device and the size of each transfer buffer + XferBuffers(const Device& device, size_t bufSize) + : bufSize_(bufSize) + , acquiredCnt_(0) + , gpuDevice_(device) + {} + + //! Destructor + ~XferBuffers(); + + //! Creates the xfer buffers object + bool create(); + + //! Acquires an instance of the transfer buffers + Memory& acquire(); + + //! Releases transfer buffer + void release( + VirtualGPU& gpu, //!< Virtual GPU object used with the buffer + Memory& buffer //!< Transfer buffer for release + ); + + //! Returns the buffer's size for transfer + size_t bufSize() const { return bufSize_; } + + private: + //! Disable copy constructor + XferBuffers(const XferBuffers&); + + //! Disable assignment operator + XferBuffers& operator=(const XferBuffers&); + + //! 
Get device object + const Device& dev() const { return gpuDevice_; } + + size_t bufSize_; //!< Staged buffer size + std::list freeBuffers_; //!< The list of free buffers + amd::Atomic acquiredCnt_; //!< The total number of acquired buffers + amd::Monitor lock_; //!< Staged buffer acquire/release lock + const Device& gpuDevice_; //!< GPU device object + }; + //! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc). static bool init(); static void tearDown(); @@ -354,6 +402,17 @@ public: //! Adds a map target to the cache bool addMapTarget(amd::Memory* memory) const; + //! Returns the staged write transfer buffers (dereferences xferWrite_ unchecked) + XferBuffers& xferWrite() const { return *xferWrite_; } + + //! Returns the staged read transfer buffers (dereferences xferRead_ unchecked) + XferBuffers& xferRead() const { return *xferRead_; } + + //! Returns a ROC memory object from AMD memory object + roc::Memory* getRocMemory( + amd::Memory* mem //!< Pointer to AMD memory object + ) const; + private: static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; @@ -379,6 +438,9 @@ private: VirtualGPU* xferQueue() const; + XferBuffers* xferRead_; //!< Transfer buffers read + XferBuffers* xferWrite_; //!< Transfer buffers write + public: amd::Atomic numOfVgpus_; //!< Virtual gpu unique index }; // class roc::Device diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp index bc614e85e4..6a7ce7756f 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp @@ -25,10 +25,18 @@ namespace roc { /////////////////////////////////roc::Memory////////////////////////////// Memory::Memory(const roc::Device &dev, amd::Memory &owner) - : device::Memory(owner), - dev_(dev), - deviceMemory_(NULL), - kind_(MEMORY_KIND_NORMAL) + : device::Memory(owner) + , dev_(dev) + , deviceMemory_(NULL) + , kind_(MEMORY_KIND_NORMAL) +{ +} + +Memory::Memory(const roc::Device &dev, size_t size) + : device::Memory(size) + , dev_(dev) + 
, deviceMemory_(NULL) + , kind_(MEMORY_KIND_NORMAL) { } @@ -64,8 +72,8 @@ Memory::allocateMapMemory(size_t allocationSize) roc::Memory* hsaMapMemory = reinterpret_cast( mapMemory->getDeviceMemory(dev_)); if (hsaMapMemory == nullptr) { - mapMemory->release(); - return false; + mapMemory->release(); + return false; } } @@ -191,7 +199,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada return false; #else assert(owner()->isInterop() && "Object is not an interop object."); - + mesa_glinterop_export_in in; mesa_glinterop_export_out out; @@ -213,7 +221,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada if(!dev_.mesa().Export(in, out)) return false; - + size_t size; hsa_agent_t agent=dev_.getBackendDevice(); hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata); @@ -242,9 +250,18 @@ Buffer::Buffer(const roc::Device &dev, amd::Memory &owner) : roc::Memory(dev, owner) {} +Buffer::Buffer(const roc::Device &dev, size_t size) + : roc::Memory(dev, size) +{} + Buffer::~Buffer() { - destroy(); + if (owner() == nullptr) { + dev_.hostFree(deviceMemory_, size()); + } + else { + destroy(); + } } void @@ -288,6 +305,15 @@ Buffer::destroy() bool Buffer::create() { + if (owner() == nullptr) { + deviceMemory_ = dev_.hostAlloc(size(), 1, false); + if (deviceMemory_ != nullptr) { + flags_ |= HostMemoryDirectAccess; + return true; + } + return false; + } + //Interop buffer if(owner()->isInterop()) return createInteropBuffer(GL_ARRAY_BUFFER, 0, NULL, NULL); @@ -303,8 +329,7 @@ Buffer::create() } const size_t offset = owner()->getOrigin(); - deviceMemory_ = - static_cast(parentBuffer->getDeviceMemory()) + offset; + deviceMemory_ = parentBuffer->getDeviceMemory() + offset; flags_ |= SubMemoryObject; flags_ |= @@ -562,10 +587,10 @@ Image::createInteropImage() { auto obj=owner()->getInteropObj()->asGLObject(); 
assert(obj->getCLGLObjectType()!=CL_GL_OBJECT_BUFFER && "Non-image OpenGL object used with interop image API."); - + const hsa_amd_image_descriptor_t* meta; size_t size=0; - + GLenum glTarget = obj->getGLTarget(); if (glTarget == GL_TEXTURE_CUBE_MAP) { glTarget = obj->getCubemapFace(); @@ -593,13 +618,13 @@ Image::createInteropImage() if (obj->getGLTarget()==GL_TEXTURE_CUBE_MAP) desc.setFace(obj->getCubemapFace()); - + originalDeviceMemory_=deviceMemory_; hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_); if(err!=HSA_STATUS_SUCCESS) return false; - + BufferGuard.Dismiss(); DescGuard.Dismiss(); return true; @@ -672,13 +697,13 @@ Image::create() } bool -Image::createView(Memory &parent) +Image::createView(const Memory &parent) { deviceMemory_ = parent.getDeviceMemory(); originalDeviceMemory_ = (parent.owner()->asBuffer() != NULL) ? deviceMemory_ - : static_cast(parent).originalDeviceMemory_; + : static_cast(parent).originalDeviceMemory_; kind_=parent.getKind(); diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp index c2d77f6201..92e945cb13 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp @@ -18,10 +18,12 @@ class Memory : public device::Memory { Memory(const roc::Device &dev, amd::Memory &owner); + Memory(const roc::Device &dev, size_t size); + virtual ~Memory(); - // Getter for deviceMemory_. - void *getDeviceMemory() const { return deviceMemory_; } + // Getter for deviceMemory_ + address getDeviceMemory() const { return reinterpret_cast
(deviceMemory_); } // Gets a pointer to a region of host-visible memory for use as the target // of an indirect map for a given memory object @@ -41,7 +43,7 @@ class Memory : public device::Memory { Unimplemented(); return true; } - + // Immediate blocking write from device cache to owners's backing store. // Marks owner as "current" by resetting the last writer to NULL. virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()) @@ -112,6 +114,7 @@ class Memory : public device::Memory { class Buffer : public roc::Memory { public: Buffer(const roc::Device &dev, amd::Memory &owner); + Buffer(const roc::Device &dev, size_t size); virtual ~Buffer(); @@ -143,7 +146,7 @@ public: virtual bool create(); //! Create an image view - bool createView(Memory &parent); + bool createView(const Memory &parent); //! Gets a pointer to a region of host-visible memory for use as the target //! of an indirect map for a given memory object diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp index d3cb3a0b17..cbdccdc700 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp @@ -53,14 +53,38 @@ Settings::Settings() enablePartialDispatch_ = (partialDispatch) ? false : true; partialDispatch_ = (partialDispatch) ? 
false : true; commandQueues_ = 100; //!< Field value set to maximum number - //!< concurrent Virtual GPUs for ROCm backend + //!< concurrent Virtual GPUs for ROCm backend + + // Disable image DMA by default (ROCM runtime doesn't support it) + imageDMA_ = false; + + stagedXferRead_ = true; + stagedXferWrite_ = true; + stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki; + + // Initialize transfer buffer size to 1MB by default + xferBufSize_ = 1024 * Ki; + + const static size_t MaxPinnedXferSize = 32; + pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi; + pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); } bool -Settings::create(bool doublePrecision) +Settings::create(bool fullProfile) { customHostAllocator_ = true; + if (fullProfile) { + pinnedXferSize_ = 0; + stagedXferSize_ = 0; + xferBufSize_ = 0; + } + else { + pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_); + stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki); + } + // Enable extensions enableExtension(ClKhrByteAddressableStore); enableExtension(ClKhrGlobalInt32BaseAtomics); @@ -72,21 +96,16 @@ Settings::create(bool doublePrecision) enableExtension(ClKhr3DImageWrites); enableExtension(ClAmdMediaOps); enableExtension(ClAmdMediaOps2); - if(MesaInterop::Supported()) - enableExtension(ClKhrGlSharing); - - // Make sure device supports doubles - doublePrecision_ &= doublePrecision; - - if (doublePrecision_) { - // Enable KHR double precision extension - enableExtension(ClKhrFp64); -#if !defined(WITH_LIGHTNING_COMPILER) - // Also enable AMD double precision extension? - enableExtension(ClAmdFp64); -#endif // !defined(WITH_LIGHTNING_COMPILER) + if(MesaInterop::Supported()) { + enableExtension(ClKhrGlSharing); } + // Enable KHR double precision extension + enableExtension(ClKhrFp64); +#if !defined(WITH_LIGHTNING_COMPILER) + // Also enable AMD double precision extension? 
+ enableExtension(ClAmdFp64); +#endif // !defined(WITH_LIGHTNING_COMPILER) enableExtension(ClKhrSubGroups); enableExtension(ClKhrDepthImages); @@ -109,6 +128,19 @@ Settings::override() if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) { commandQueues_ = GPU_MAX_COMMAND_QUEUES; } + + if (!flagIsDefault(GPU_IMAGE_DMA)) { + // The flag controls the image DMA switch, not the command queue limit + imageDMA_ = GPU_IMAGE_DMA; + } + + if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { + xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; + } + + if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) { + pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); + } } } // namespace roc diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp index fc716584ad..4e1f9400f8 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp @@ -26,7 +26,10 @@ public: uint enableImageHandle_: 1; //!< Use HSAIL image/sampler pointer uint enableNCMode_: 1; //!< Enable Non Coherent mode for system memory uint enablePartialDispatch_: 1; //!< Enable support for Partial Dispatch - uint reserved_: 26; + uint imageDMA_: 1; //!< Enable direct image DMA transfers + uint stagedXferRead_: 1; //!< Uses a staged buffer read + uint stagedXferWrite_: 1; //!< Uses a staged buffer write + uint reserved_: 22; }; uint value_; }; @@ -46,11 +49,16 @@ public: uint kernargPoolSize_; uint signalPoolSize_; + size_t xferBufSize_; //!< Transfer buffer size for image copy optimization + size_t stagedXferSize_; //!< Staged buffer size + size_t pinnedXferSize_; //!< Pinned buffer size for transfer + size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer + //! Default constructor Settings(); //! Creates settings - bool create(bool doublePrecision); + bool create(bool fullProfile); private: //! 
Disable copy constructor diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index 06735ca902..6fe28dea08 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -446,6 +446,9 @@ bool VirtualGPU::releaseGpuMemoryFence() { hasPendingDispatch_ = false; + // Release all transfer buffers on this command queue + releaseXferWrite(); + // Release all memory dependencies memoryDependency().clear(); @@ -1774,8 +1777,66 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) profilingEnd(vcmd); } -void VirtualGPU::flush(amd::Command *list, bool wait) { +void VirtualGPU::flush(amd::Command *list, bool wait) +{ releaseGpuMemoryFence(); updateCommandsState(list); + // Release all pinned memory + releasePinnedMem(); +} + +void +VirtualGPU::addXferWrite(Memory& memory) +{ + if (xferWriteBuffers_.size() > 7) { + dev().xferWrite().release(*this, *xferWriteBuffers_.front()); + xferWriteBuffers_.erase(xferWriteBuffers_.begin()); + } + + // Delay the release: the buffer is handed back by releaseXferWrite() or when the list overflows + xferWriteBuffers_.push_back(&memory); +} + +void +VirtualGPU::releaseXferWrite() +{ + for (auto& memory : xferWriteBuffers_) { + dev().xferWrite().release(*this, *memory); + } + xferWriteBuffers_.resize(0); +} + +void +VirtualGPU::addPinnedMem(amd::Memory* mem) +{ + if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { + if (pinnedMems_.size() > 7) { + pinnedMems_.front()->release(); + pinnedMems_.erase(pinnedMems_.begin()); + } + + // Delay the release: the object is released by releasePinnedMem() or when the list overflows + pinnedMems_.push_back(mem); + } +} + +void +VirtualGPU::releasePinnedMem() +{ + for (auto& amdMemory : pinnedMems_) { + amdMemory->release(); + } + pinnedMems_.resize(0); +} + +amd::Memory* +VirtualGPU::findPinnedMem(void* addr, size_t size) +{ + for (auto& amdMemory : pinnedMems_) { + if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { + return amdMemory; + } + } + return 
nullptr; } } // End of roc namespace diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp index 8ff19db976..bbf29929f6 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -149,7 +149,7 @@ public: void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd); void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd); void submitPerfCounter(amd::PerfCounterCommand& cmd){}; - + void flush(amd::Command* list = NULL, bool wait = false); void submitFillMemory(amd::FillMemoryCommand& cmd); void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); @@ -193,10 +193,24 @@ public: bool processMemObjects( const amd::Kernel& kernel, //!< AMD kernel object for execution const_address params //!< Pointer to the param's store - ); + ); //Retun the virtual gpu unique index uint index() const { return index_; } + //! Adds a stage write buffer into a list + void addXferWrite(Memory& memory); + + //! Releases stage write buffers + void releaseXferWrite(); + + //! Adds a pinned memory object into a map + void addPinnedMem(amd::Memory* mem); + + //! Release pinned memory objects + void releasePinnedMem(); + + //! Finds if pinned memory is cached + amd::Memory* findPinnedMem(void* addr, size_t size); // } roc OpenCL integration private: @@ -219,6 +233,9 @@ private: //! Updates AQL header for the upcomming dispatch void setAqlHeader(uint16_t header) { aqlHeader_ = header; } + std::vector xferWriteBuffers_; //!< Stage write buffers + std::vector pinnedMems_; //!< Pinned memory list + /** * @brief Maintains the list of sampler allocated for one or more kernel * submissions. 
@@ -231,16 +248,16 @@ private: */ bool hasPendingDispatch_; Timestamp* timestamp_; - hsa_agent_t gpu_device_; //!< Physical device - hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu + hsa_agent_t gpu_device_; //!< Physical device + hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu hsa_barrier_and_packet_t barrier_packet_; hsa_signal_t barrier_signal_; - uint32_t dispatch_id_; //!< This variable must be updated atomically. - Device& roc_device_; //!< roc device object + uint32_t dispatch_id_; //!< This variable must be updated atomically. + Device& roc_device_; //!< roc device object void * tools_lib_; PrintfDbg* printfdbg_; MemoryDependency memoryDependency_; //!< Memory dependency class - uint16_t aqlHeader_; //!< AQL header for dispatch + uint16_t aqlHeader_; //!< AQL header for dispatch char* kernarg_pool_base_; size_t kernarg_pool_size_;