From 8b5893628268cf56360fb7918e8feb8d6498c896 Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 23 Jan 2017 11:59:51 -0500
Subject: [PATCH] P4 to Git Change 1364923 by gandryey@gera-lnx-rcf-lc on
2017/01/23 11:48:45
SWDEV-110996 - OCL to use the blit manager instead of ROCr implementing copyRect API
- Implement the blit manager functionality in the ROCm backend. This check-in also fixes SWDEV-95079, SWDEV-95068, SWDEV-95069, SWDEV-95071
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdefs.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#35 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#6 edit
[ROCm/clr commit: 454621b7f12eebc2edacaefff91c47442e51c35f]
---
.../rocclr/runtime/device/rocm/rocblit.cpp | 3393 +++++++++++------
.../rocclr/runtime/device/rocm/rocblit.hpp | 512 +--
.../rocclr/runtime/device/rocm/rocdefs.hpp | 3 +
.../rocclr/runtime/device/rocm/rocdevice.cpp | 140 +-
.../rocclr/runtime/device/rocm/rocdevice.hpp | 62 +
.../rocclr/runtime/device/rocm/rocmemory.cpp | 59 +-
.../rocclr/runtime/device/rocm/rocmemory.hpp | 11 +-
.../runtime/device/rocm/rocsettings.cpp | 61 +-
.../runtime/device/rocm/rocsettings.hpp | 12 +-
.../rocclr/runtime/device/rocm/rocvirtual.cpp | 63 +-
.../rocclr/runtime/device/rocm/rocvirtual.hpp | 31 +-
11 files changed, 2820 insertions(+), 1527 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
index 83c6988a7d..a557565ee8 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
@@ -1,528 +1,996 @@
//
-// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
-
#include "platform/commandqueue.hpp"
#include "device/rocm/rocdevice.hpp"
#include "device/rocm/rocblit.hpp"
#include "device/rocm/rocmemory.hpp"
#include "device/rocm/rocvirtual.hpp"
#include "utils/debug.hpp"
+#include <algorithm>
namespace roc {
-
-void
-FindPinSize(
- size_t& pinSize, const amd::Coord3D& size,
- size_t& rowPitch, size_t& slicePitch, const Image& image)
+DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
+ : HostBlitManager(gpu, setup)
+ , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_)
+ , completeOperation_(false)
+ , context_(NULL)
{
- size_t elementSize = image.owner()->asImage()->getImageFormat().getElementSize();
- pinSize = size[0] * elementSize;
- if ((rowPitch == 0) || (rowPitch == pinSize)) {
- rowPitch = 0;
+}
+
+inline void
+DmaBlitManager::synchronize() const
+{
+ gpu().releaseGpuMemoryFence();
+
+ if (syncOperation_) {
+// gpu().waitAllEngines();
+ }
+}
+
+inline Memory&
+DmaBlitManager::gpuMem(device::Memory& mem) const
+{
+ return static_cast<Memory&>(mem);
+}
+
+bool
+DmaBlitManager::readMemoryStaged(
+ Memory& srcMemory,
+ void* dstHost,
+ Memory& xferBuf,
+ size_t origin,
+ size_t& offset,
+ size_t& totalSize,
+ size_t xferSize) const
+{
+ const_address src = srcMemory.getDeviceMemory();
+ address staging = xferBuf.getDeviceMemory();
+
+ // Copy data from device to host
+ src += origin + offset;
+ address dst = reinterpret_cast<address>(dstHost) + offset;
+ bool ret = hsaCopyStaged(src, dst, totalSize, staging, false);
+
+ return ret;
+}
+
+bool
+DmaBlitManager::readBuffer(
+ device::Memory& srcMemory,
+ void* dstHost,
+ const amd::Coord3D& origin,
+ const amd::Coord3D& size,
+ bool entire) const
+{
+ // Use host copy if memory has direct access
+ if (setup_.disableReadBuffer_ || gpuMem(srcMemory).isHostMemDirectAccess()) {
+ return HostBlitManager::readBuffer(
+ srcMemory, dstHost, origin, size, entire);
}
else {
- pinSize = rowPitch;
- }
+ size_t srcSize = size[0];
+ size_t offset = 0;
+ size_t pinSize = dev().settings().pinnedXferSize_;
+ pinSize = std::min(pinSize, srcSize);
- // Calculate the pin size, which should be equal to the copy size
- for (uint i = 1; i < 3; ++i) {
- pinSize *= size[i];
- if (i == 1) {
- if ((slicePitch == 0) || (slicePitch == pinSize)) {
- slicePitch = 0;
- }
- else {
- if (image.getHsaImageDescriptor().geometry != HSA_EXT_IMAGE_GEOMETRY_1DA) {
- pinSize = slicePitch;
+ // Check if a pinned transfer can be executed
+ if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
+ // Align offset to 4K boundary (Vista/Win7 limitation)
+ char* tmpHost = const_cast<char*>(
+ amd::alignDown(reinterpret_cast<const char*>(dstHost),
+ PinnedMemoryAlignment));
+
+ // Find the partial size for unaligned copy
+ size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;
+
+ amd::Memory* pinned = NULL;
+ bool first = true;
+ size_t tmpSize;
+ size_t pinAllocSize;
+
+ // Copy memory, using pinning
+ while (srcSize > 0) {
+ // If it's the first iteration, then readjust the copy size
+ // to include alignment
+ if (first) {
+ pinAllocSize = amd::alignUp(pinSize + partial,
+ PinnedMemoryAlignment);
+ tmpSize = std::min(pinAllocSize - partial, srcSize);
+ first = false;
}
else {
- pinSize = slicePitch * size[i];
+ tmpSize = std::min(pinSize, srcSize);
+ pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
+ partial = 0;
+ }
+ amd::Coord3D dst(partial, 0, 0);
+ amd::Coord3D srcPin(origin[0] + offset, 0, 0);
+ amd::Coord3D copySizePin(tmpSize, 0, 0);
+ size_t partial2;
+
+ // Allocate a GPU resource for pinning
+ pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
+ if (pinned != NULL) {
+ // Get device memory for this virtual device
+ Memory* dstMemory = dev().getRocMemory(pinned);
+
+ if (!hsaCopy(gpuMem(srcMemory), *dstMemory,
+ srcPin, dst, copySizePin)) {
+ LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
+ gpu().addPinnedMem(pinned);
+ break;
+ }
+ gpu().addPinnedMem(pinned);
+ }
+ else {
+ LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
+ break;
+ }
+ srcSize -= tmpSize;
+ offset += tmpSize;
+ tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
+ }
+ }
+
+ if (0 != srcSize) {
+ Memory& xferBuf = dev().xferRead().acquire();
+
+ // Read memory using a staging resource
+ if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0],
+ offset, srcSize, srcSize)) {
+ LogError("DmaBlitManager::readBuffer failed!");
+ return false;
+ }
+
+ dev().xferRead().release(gpu(), xferBuf);
+ }
+ }
+
+ return true;
+}
+
+bool
+DmaBlitManager::readBufferRect(
+ device::Memory& srcMemory,
+ void* dstHost,
+ const amd::BufferRect& bufRect,
+ const amd::BufferRect& hostRect,
+ const amd::Coord3D& size,
+ bool entire) const
+{
+ // Use host copy if memory has direct access
+ if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) {
+ return HostBlitManager::readBufferRect(
+ srcMemory, dstHost, bufRect, hostRect, size, entire);
+ }
+ else {
+ Memory& xferBuf = dev().xferRead().acquire();
+ address staging = xferBuf.getDeviceMemory();
+ const_address src = gpuMem(srcMemory).getDeviceMemory();
+
+ size_t srcOffset;
+ size_t dstOffset;
+
+ for (size_t z = 0; z < size[2]; ++z) {
+ for (size_t y = 0; y < size[1]; ++y) {
+ srcOffset = bufRect.offset(0, y, z);
+ dstOffset = hostRect.offset(0, y, z);
+
+ // Copy data from device to host - line by line
+ address dst = reinterpret_cast<address>(dstHost) + dstOffset;
+ src += srcOffset;
+ bool retval = hsaCopyStaged(src, dst, size[0], staging, false);
+ if (!retval) {
+ return retval;
}
}
}
+ dev().xferRead().release(gpu(), xferBuf);
}
+
+ return true;
}
-HsaBlitManager::HsaBlitManager(device::VirtualDevice& vDev, Setup setup)
- : HostBlitManager(vDev, setup),
- roc_device_(reinterpret_cast(dev_)) {
- completion_signal_.handle = 0;
-}
-
-bool HsaBlitManager::hsaCopy(const void *hostSrc, void *hostDst,
- uint32_t size, bool hostToDev) const {
-
- // No allocation is necessary for Full Profile
- hsa_status_t status;
- if (roc_device_.agent_profile() == HSA_PROFILE_FULL) {
- status = hsa_memory_copy(hostDst, hostSrc, size);
- if (status != HSA_STATUS_SUCCESS) {
- LogPrintfError("Hsa copy of data failed with code %d", status);
- }
- return (status == HSA_STATUS_SUCCESS);
- }
-
- // Allocate requested size of memory
- size_t align = 0x04;
- bool atomics = false;
- void *hsaBuffer = NULL;
- hsaBuffer = roc_device_.hostAlloc(size, align, false);
- if (hsaBuffer == NULL) {
- LogError("Hsa buffer allocation failed with code");
- return false;
- }
-
- const hsa_signal_value_t kInitVal = 1;
- hsa_signal_store_relaxed(completion_signal_, kInitVal);
-
- // Copy data from Host to Device
- if (hostToDev) {
- memcpy(hsaBuffer, hostSrc, size);
- status = hsa_amd_memory_async_copy(
- hostDst, roc_device_.getBackendDevice(), hsaBuffer,
- roc_device_.getCpuAgent(), size, 0, NULL, completion_signal_);
- if (status == HSA_STATUS_SUCCESS) {
- hsa_signal_value_t val =
- hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
- uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
- if (val != (kInitVal - 1)) {
- LogError("Async copy failed");
- status = HSA_STATUS_ERROR;
- }
+bool
+DmaBlitManager::readImage(
+ device::Memory& srcMemory,
+ void* dstHost,
+ const amd::Coord3D& origin,
+ const amd::Coord3D& size,
+ size_t rowPitch,
+ size_t slicePitch,
+ bool entire) const
+{
+ if (setup_.disableReadImage_) {
+ return HostBlitManager::readImage(srcMemory, dstHost,
+ origin, size, rowPitch, slicePitch, entire);
}
else {
- LogPrintfError("Hsa copy from host to device failed with code %d", status);
+ //! @todo Add HW accelerated path
+ return HostBlitManager::readImage(srcMemory, dstHost,
+ origin, size, rowPitch, slicePitch, entire);
}
- roc_device_.hostFree(hsaBuffer, size);
- return (status == HSA_STATUS_SUCCESS);
- }
+ return true;
+}
- // Copy data from Device to Host
- status = hsa_amd_memory_async_copy(hsaBuffer, roc_device_.getCpuAgent(),
- hostSrc, roc_device_.getBackendDevice(),
- size, 0, NULL, completion_signal_);
- if (status == HSA_STATUS_SUCCESS) {
- hsa_signal_value_t val = hsa_signal_wait_acquire(
- completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
- HSA_WAIT_STATE_ACTIVE);
+bool
+DmaBlitManager::writeMemoryStaged(
+ const void* srcHost,
+ Memory& dstMemory,
+ Memory& xferBuf,
+ size_t origin,
+ size_t& offset,
+ size_t& totalSize,
+ size_t xferSize) const
+{
+ address dst = dstMemory.getDeviceMemory();
+ address staging = xferBuf.getDeviceMemory();
- if (val != (kInitVal - 1)) {
- LogError("Async copy failed");
- status = HSA_STATUS_ERROR;
+ // Copy data from host to device
+ dst += origin + offset;
+ const_address src = reinterpret_cast<const_address>(srcHost) + offset;
+ bool retval = hsaCopyStaged(src, dst, totalSize, staging, true);
+
+ return retval;
+}
+
+bool
+DmaBlitManager::writeBuffer(
+ const void* srcHost,
+ device::Memory& dstMemory,
+ const amd::Coord3D& origin,
+ const amd::Coord3D& size,
+ bool entire) const
+{
+ // Use host copy if memory has direct access
+ if (setup_.disableWriteBuffer_ ||
+ gpuMem(dstMemory).isHostMemDirectAccess()) {
+ return HostBlitManager::writeBuffer(
+ srcHost, dstMemory, origin, size, entire);
+ }
+ else {
+ size_t dstSize = size[0];
+ size_t tmpSize = 0;
+ size_t offset = 0;
+ size_t pinSize = dev().settings().pinnedXferSize_;
+ pinSize = std::min(pinSize, dstSize);
+
+ // Check if a pinned transfer can be executed
+ if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
+ // Align offset to 4K boundary (Vista/Win7 limitation)
+ char* tmpHost = const_cast<char*>(
+ amd::alignDown(reinterpret_cast<const char*>(srcHost),
+ PinnedMemoryAlignment));
+
+ // Find the partial size for unaligned copy
+ size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
+
+ amd::Memory* pinned = NULL;
+ bool first = true;
+ size_t tmpSize;
+ size_t pinAllocSize;
+
+ // Copy memory, using pinning
+ while (dstSize > 0) {
+ // If it's the first iteration, then readjust the copy size
+ // to include alignment
+ if (first) {
+ pinAllocSize = amd::alignUp(pinSize + partial,
+ PinnedMemoryAlignment);
+ tmpSize = std::min(pinAllocSize - partial, dstSize);
+ first = false;
+ }
+ else {
+ tmpSize = std::min(pinSize, dstSize);
+ pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
+ partial = 0;
+ }
+ amd::Coord3D src(partial, 0, 0);
+ amd::Coord3D dstPin(origin[0] + offset, 0, 0);
+ amd::Coord3D copySizePin(tmpSize, 0, 0);
+ size_t partial2;
+
+ // Allocate a GPU resource for pinning
+ pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
+
+ if (pinned != NULL) {
+ // Get device memory for this virtual device
+ Memory* srcMemory = dev().getRocMemory(pinned);
+
+ if (!hsaCopy(*srcMemory, gpuMem(dstMemory), src, dstPin,
+ copySizePin)) {
+ LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
+ gpu().addPinnedMem(pinned);
+ break;
+ }
+ gpu().addPinnedMem(pinned);
+ }
+ else {
+ LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
+ break;
+ }
+ dstSize -= tmpSize;
+ offset += tmpSize;
+ tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
+ }
+ }
+
+ if (dstSize != 0) {
+ Memory& xferBuf = dev().xferWrite().acquire();
+
+ // Write memory using a staging resource
+ if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0],
+ offset, dstSize, dstSize)) {
+ LogError("DmaBlitManager::writeBuffer failed!");
+ return false;
+ }
+
+ gpu().addXferWrite(xferBuf);
+ }
}
- if (status == HSA_STATUS_SUCCESS) {
- memcpy(hostDst, hsaBuffer, size);
+ return true;
+}
+
+bool
+DmaBlitManager::writeBufferRect(
+ const void* srcHost,
+ device::Memory& dstMemory,
+ const amd::BufferRect& hostRect,
+ const amd::BufferRect& bufRect,
+ const amd::Coord3D& size,
+ bool entire) const
+{
+ // Use host copy if memory has direct access
+ if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) {
+ return HostBlitManager::writeBufferRect(
+ srcHost, dstMemory, hostRect, bufRect, size, entire);
}
- } else {
- LogPrintfError("Hsa copy from device to host failed with code %d", status);
- }
-
- roc_device_.hostFree(hsaBuffer, size);
- return (status == HSA_STATUS_SUCCESS);
-}
+ else {
+ Memory& xferBuf = dev().xferWrite().acquire();
+ address staging = xferBuf.getDeviceMemory();
+ address dst = static_cast<Memory&>(dstMemory).getDeviceMemory();
-bool HsaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, bool entire) const {
- hsa_memory_register(dstHost, size[0]);
- void* src = static_cast(srcMemory).getDeviceMemory();
+ size_t srcOffset;
+ size_t dstOffset;
- // Copy data from device to host
- const void *srcDev = reinterpret_cast(src) + origin[0];
- bool retval = hsaCopy(srcDev, dstHost, size[0], false);
+ for (size_t z = 0; z < size[2]; ++z) {
+ for (size_t y = 0; y < size[1]; ++y) {
+ srcOffset = hostRect.offset(0, y, z);
+ dstOffset = bufRect.offset(0, y, z);
- hsa_memory_deregister(dstHost, size[0]);
- return retval;
-}
-
-bool HsaBlitManager::readBufferRect(device::Memory& srcMemory, void* dst,
- const amd::BufferRect& bufRect,
- const amd::BufferRect& hostRect,
- const amd::Coord3D& size,
- bool entire) const {
- void* src = static_cast(srcMemory).getDeviceMemory();
-
- size_t srcOffset;
- size_t dstOffset;
-
- for (size_t z = 0; z < size[2]; ++z) {
- for (size_t y = 0; y < size[1]; ++y) {
- srcOffset = bufRect.offset(0, y, z);
- dstOffset = hostRect.offset(0, y, z);
-
- // Copy data from device to host - line by line
- void *dstHost = reinterpret_cast(dst) + dstOffset;
- const void *srcDev = reinterpret_cast(src) + srcOffset;
- bool retval = hsaCopy(srcDev, dstHost, size[0], false);
- if (!retval) {
- return retval;
- }
- }
- }
-
- return true;
-}
-
-static bool hsaCopyImageToBuffer(hsa_agent_t agent,
- hsa_ext_image_t srcImage,
- void* dstBuffer, const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch, size_t slicePitch) {
- hsa_ext_image_region_t image_region;
- image_region.offset.x = srcOrigin[0];
- image_region.offset.y = srcOrigin[1];
- image_region.offset.z = srcOrigin[2];
- image_region.range.x = size[0];
- image_region.range.y = size[1];
- image_region.range.z = size[2];
-
- char *dstHost = ((char*)dstBuffer) + dstOrigin[0];
-
- hsa_status_t status = hsa_ext_image_export(agent, srcImage, dstHost, rowPitch,
- slicePitch, &image_region);
- return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, size_t rowPitch,
- size_t slicePitch, bool entire) const {
- roc::Image* srcImage = (roc::Image*)&srcMemory;
-
- void* svmDstHost = NULL;
- size_t pinSize = 0;
- FindPinSize(pinSize, size, rowPitch, slicePitch, *srcImage);
-
- hsa_agent_t agent = gpu().gpu_device();
-
- hsa_status_t status = hsa_amd_memory_lock(dstHost, pinSize,
- &agent, 1, &svmDstHost);
-
- if (status != HSA_STATUS_SUCCESS) {
- return false;
- }
-
- bool retval = hsaCopyImageToBuffer(agent, srcImage->getHsaImageObject(),
- svmDstHost, origin, amd::Coord3D(0), size, entire,
- rowPitch, slicePitch);
- hsa_amd_memory_unlock(dstHost);
- return retval;
-}
-
-bool HsaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, bool entire) const {
- hsa_memory_register(const_cast(srcHost), size[0]);
- void* dst = static_cast(dstMemory).getDeviceMemory();
-
- // Copy data from host to device
- void *dstDev = reinterpret_cast(dst) + origin[0];
- bool retval = hsaCopy(srcHost, dstDev, size[0], true);
-
- hsa_memory_deregister(const_cast(srcHost), size[0]);
- return retval;
-}
-
-bool HsaBlitManager::writeBufferRect(const void* src,
- device::Memory& dstMemory,
- const amd::BufferRect& hostRect,
- const amd::BufferRect& bufRect,
- const amd::Coord3D& size,
- bool entire) const {
- void* dst = static_cast(dstMemory).getDeviceMemory();
-
- size_t srcOffset;
- size_t dstOffset;
-
- for (size_t z = 0; z < size[2]; ++z) {
- for (size_t y = 0; y < size[1]; ++y) {
- srcOffset = hostRect.offset(0, y, z);
- dstOffset = bufRect.offset(0, y, z);
-
- // Copy data from host to device - line by line
- void *dstDev = reinterpret_cast(dst) + dstOffset;
- const void *srcHost = reinterpret_cast(src) + srcOffset;
- bool retval = hsaCopy(srcHost, dstDev, size[0], true);
- if (!retval) {
- return retval;
- }
- }
- }
-
- return true;
-}
-
-bool hsaCopyBufferToImage(hsa_agent_t agent, const void* srcBuffer,
- hsa_ext_image_t dstImage,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch, size_t slicePitch) {
- char* srcHost = ((char*)srcBuffer) + srcOrigin[0];
-
- hsa_ext_image_region_t image_region;
- image_region.offset.x = dstOrigin[0];
- image_region.offset.y = dstOrigin[1];
- image_region.offset.z = dstOrigin[2];
- image_region.range.x = size[0];
- image_region.range.y = size[1];
- image_region.range.z = size[2];
-
- hsa_status_t status = hsa_ext_image_import(
- agent, srcHost, rowPitch, slicePitch, dstImage, &image_region);
- return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, size_t rowPitch,
- size_t slicePitch, bool entire) const {
- roc::Image* image = (roc::Image*)&dstMemory;
-
- void* svmSrcHost = NULL;
- size_t pinSize = 0;
- FindPinSize(pinSize, size, rowPitch, slicePitch, *image);
-
- hsa_agent_t agent = gpu().gpu_device();
-
- hsa_status_t status = hsa_amd_memory_lock(const_cast(srcHost), pinSize,
- &agent, 1, &svmSrcHost);
-
- if (status != HSA_STATUS_SUCCESS) {
- return false;
- }
-
- bool retval = hsaCopyBufferToImage(agent, svmSrcHost,
- image->getHsaImageObject(), amd::Coord3D(0),
- origin, size, entire, rowPitch, slicePitch);
-
- hsa_amd_memory_unlock(const_cast(srcHost));
-
- return retval;
-}
-
-bool HsaBlitManager::copyBuffer(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire) const {
- void* src = static_cast(srcMemory).getDeviceMemory();
- void* dst = static_cast(dstMemory).getDeviceMemory();
-
- if (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) {
- if (srcMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) {
- src = srcMemory.owner()->getHostMem();
+ // Copy data from host to device - line by line
+ dst += dstOffset;
+ const_address src = reinterpret_cast<const_address>(srcHost) + srcOffset;
+ bool retval = hsaCopyStaged(src, dst, size[0], staging, true);
+ if (!retval) {
+ return retval;
+ }
+ }
+ }
+ gpu().addXferWrite(xferBuf);
}
- if (dstMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) {
- dst = dstMemory.owner()->getHostMem();
- }
- }
-
- const hsa_agent_t src_agent = (srcMemory.isHostMemDirectAccess())
- ? roc_device_.getCpuAgent()
- : roc_device_.getBackendDevice();
-
- const hsa_agent_t dst_agent = (dstMemory.isHostMemDirectAccess())
- ? roc_device_.getCpuAgent()
- : roc_device_.getBackendDevice();
-
- // Straight forward buffer copy
- const hsa_signal_value_t kInitVal = 1;
- hsa_signal_store_relaxed(completion_signal_, kInitVal);
- hsa_status_t status = hsa_amd_memory_async_copy(
- (reinterpret_cast(dst) + dstOrigin[0]), dst_agent,
- (reinterpret_cast(src) + srcOrigin[0]), src_agent, size[0],
- 0, NULL, completion_signal_);
- if (status != HSA_STATUS_SUCCESS) {
- LogPrintfError("DMA buffer failed with code %d", status);
- return false;
- }
-
- hsa_signal_value_t val =
- hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
- uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
- if (val != (kInitVal - 1)) {
- LogError("Async copy failed");
- return false;
- }
-
- return true;
+ return true;
}
-bool HsaBlitManager::copyBufferRect(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::BufferRect& srcRect,
- const amd::BufferRect& dstRect,
- const amd::Coord3D& size,
- bool entire) const {
- void* src = static_cast(srcMemory).getDeviceMemory();
- void* dst = static_cast(dstMemory).getDeviceMemory();
+bool
+DmaBlitManager::writeImage(
+ const void* srcHost,
+ device::Memory& dstMemory,
+ const amd::Coord3D& origin,
+ const amd::Coord3D& size,
+ size_t rowPitch,
+ size_t slicePitch,
+ bool entire) const
+{
+ if (setup_.disableWriteImage_) {
+ return HostBlitManager::writeImage(
+ srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+ }
+ else {
+ //! @todo Add HW accelerated path
+ return HostBlitManager::writeImage(
+ srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+ }
- const hsa_signal_value_t kInitVal = size[2] * size[1];
- hsa_signal_store_relaxed(completion_signal_, kInitVal);
+ return true;
+}
- for (size_t z = 0; z < size[2]; ++z) {
- for (size_t y = 0; y < size[1]; ++y) {
- size_t srcOffset = srcRect.offset(0, y, z);
- size_t dstOffset = dstRect.offset(0, y, z);
+bool
+DmaBlitManager::copyBuffer(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool entire) const
+{
+ if (setup_.disableCopyBuffer_ ||
+ (gpuMem(srcMemory).isHostMemDirectAccess() &&
+ (dev().agent_profile() != HSA_PROFILE_FULL) &&
+ gpuMem(dstMemory).isHostMemDirectAccess())) {
+ return HostBlitManager::copyBuffer(
+ srcMemory, dstMemory, srcOrigin, dstOrigin, size);
+ }
+ else {
+ return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory),
+ srcOrigin, dstOrigin, size);
+ }
- // Copy memory line by line
- hsa_status_t status = hsa_amd_memory_async_copy(
- (reinterpret_cast(dst) + dstOffset),
- roc_device_.getBackendDevice(),
- (reinterpret_cast(src) + srcOffset),
- roc_device_.getBackendDevice(), size[0], 0, NULL,
- completion_signal_);
- if (status != HSA_STATUS_SUCCESS) {
- LogPrintfError("DMA buffer failed with code %d", status);
+ return true;
+}
+
+bool
+DmaBlitManager::copyBufferRect(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::BufferRect& srcRect,
+ const amd::BufferRect& dstRect,
+ const amd::Coord3D& size,
+ bool entire) const
+{
+ if (setup_.disableCopyBufferRect_ ||
+ (gpuMem(srcMemory).isHostMemDirectAccess() &&
+ gpuMem(dstMemory).isHostMemDirectAccess())) {
+ return HostBlitManager::copyBufferRect(
+ srcMemory, dstMemory, srcRect, dstRect, size, entire);
+ }
+ else {
return false;
- }
+ void* src = gpuMem(srcMemory).getDeviceMemory();
+ void* dst = gpuMem(dstMemory).getDeviceMemory();
+
+ // Detect the agents for memory allocations
+ const hsa_agent_t srcAgent = (srcMemory.isHostMemDirectAccess()) ?
+ dev().getCpuAgent() : dev().getBackendDevice();
+ const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ?
+ dev().getCpuAgent() : dev().getBackendDevice();
+
+ const hsa_signal_value_t kInitVal = size[2] * size[1];
+ hsa_signal_store_relaxed(completion_signal_, kInitVal);
+
+ for (size_t z = 0; z < size[2]; ++z) {
+ for (size_t y = 0; y < size[1]; ++y) {
+ size_t srcOffset = srcRect.offset(0, y, z);
+ size_t dstOffset = dstRect.offset(0, y, z);
+
+ // Copy memory line by line
+ hsa_status_t status = hsa_amd_memory_async_copy(
+ (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
+ (reinterpret_cast<const_address>(src) + srcOffset),
+ srcAgent, size[0], 0, NULL, completion_signal_);
+ if (status != HSA_STATUS_SUCCESS) {
+ LogPrintfError("DMA buffer failed with code %d", status);
+ return false;
+ }
+ }
+ }
+
+ hsa_signal_value_t val =
+ hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ,
+ 0, uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
+
+ if (val != 0) {
+ LogError("Async copy failed");
+ return false;
+ }
}
- }
-
- hsa_signal_value_t val =
- hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ,
- 0, uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
- if (val != 0) {
- LogError("Async copy failed");
- return false;
- }
-
- return true;
+ return true;
}
-bool HsaBlitManager::copyImageToBuffer(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch,
- size_t slicePitch) const {
- roc::Image& srcImage = (roc::Image&)srcMemory;
- roc::Buffer& dstBuffer = (roc::Buffer&)dstMemory;
+bool
+DmaBlitManager::copyImageToBuffer(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool entire,
+ size_t rowPitch,
+ size_t slicePitch) const
+{
+ bool result = false;
- return hsaCopyImageToBuffer(gpu().gpu_device(), srcImage.getHsaImageObject(),
- dstBuffer.getDeviceMemory(), srcOrigin, dstOrigin,
- size, entire, rowPitch, slicePitch);
-}
+ if (setup_.disableCopyImageToBuffer_) {
+ result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory,
+ srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+ }
+ else {
+ Image& srcImage = static_cast<Image&>(srcMemory);
+ Buffer& dstBuffer = static_cast<Buffer&>(dstMemory);
-bool HsaBlitManager::copyBufferToImage(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch,
- size_t slicePitch) const {
- roc::Buffer& srcBuffer = (roc::Buffer&)srcMemory;
- roc::Image& dstImage = (roc::Image&)dstMemory;
+ // Use ROC path for a transfer
+ // Note: it doesn't support SDMA
+ address dstHost = reinterpret_cast<address>(dstBuffer.getDeviceMemory()) +
+ dstOrigin[0];
- return hsaCopyBufferToImage(gpu().gpu_device(), srcBuffer.getDeviceMemory(),
- dstImage.getHsaImageObject(), srcOrigin,
- dstOrigin, size, entire, rowPitch, slicePitch);
-}
+ // Use ROCm path for a transfer.
+ // Note: it doesn't support SDMA
+ hsa_ext_image_region_t image_region;
+ image_region.offset.x = srcOrigin[0];
+ image_region.offset.y = srcOrigin[1];
+ image_region.offset.z = srcOrigin[2];
+ image_region.range.x = size[0];
+ image_region.range.y = size[1];
+ image_region.range.z = size[2];
-bool HsaBlitManager::copyImage(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire) const {
- if (srcMemory.isHostMemDirectAccess() &&
- dstMemory.isHostMemDirectAccess()) {
- return device::HostBlitManager::copyImage(
- srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
- }
+ hsa_status_t status = hsa_ext_image_export(gpu().gpu_device(),
+ srcImage.getHsaImageObject(), dstHost, rowPitch,
+ slicePitch, &image_region);
+ result = (status == HSA_STATUS_SUCCESS) ? true : false;
- roc::Image *srcImage = (roc::Image *)&srcMemory;
- roc::Image *dstImage = (roc::Image *)&dstMemory;
-
- hsa_dim3_t src_offset = { 0 };
- src_offset.x = srcOrigin[0];
- src_offset.y = srcOrigin[1];
- src_offset.z = srcOrigin[2];
-
- hsa_dim3_t dst_offset = { 0 };
- dst_offset.x = dstOrigin[0];
- dst_offset.y = dstOrigin[1];
- dst_offset.z = dstOrigin[2];
-
- hsa_dim3_t copy_size = { 0 };
- copy_size.x = size[0];
- copy_size.y = size[1];
- copy_size.z = size[2];
-
- hsa_status_t status = hsa_ext_image_copy(
- gpu().gpu_device(), srcImage->getHsaImageObject(), &src_offset,
- dstImage->getHsaImageObject(), &dst_offset, &copy_size);
- return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
- size_t patternSize, const amd::Coord3D& origin,
- const amd::Coord3D& size, bool entire) const {
- void* fillMem = static_cast(memory).getDeviceMemory();
-
- size_t offset = origin[0];
- size_t fillSize = size[0];
-
- if ((fillSize % patternSize) != 0) {
- LogError("Misaligned buffer size and pattern size!");
- }
-
- // Fill the buffer memory with a pattern
- for (size_t i = 0; i < (fillSize / patternSize); i++) {
- void *dstDev = reinterpret_cast(fillMem) + offset;
- bool retval = hsaCopy(pattern, dstDev, patternSize, true);
- if (!retval) {
- LogError("DMA buffer failed with code");
- return retval;
+ // Check if a HostBlit transfer is required
+ if (completeOperation_ && !result) {
+ result = HostBlitManager::copyImageToBuffer(srcMemory,
+ dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+ }
}
- offset += patternSize;
- }
-
- return true;
+ return result;
}
-bool HsaBlitManager::fillImage(device::Memory& memory, const void* pattern,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, bool entire) const {
- if (memory.isHostMemDirectAccess()) {
- return device::HostBlitManager::fillImage(memory, pattern, origin, size, entire);
- }
+bool
+DmaBlitManager::copyBufferToImage(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool entire,
+ size_t rowPitch,
+ size_t slicePitch) const
+{
+ bool result = false;
- roc::Image *image = (roc::Image*)&memory;
- hsa_ext_image_region_t image_region;
- image_region.offset.x = origin[0];
- image_region.offset.y = origin[1];
- image_region.offset.z = origin[2];
- image_region.range.x = size[0];
- image_region.range.y = size[1];
- image_region.range.z = size[2];
+ if (setup_.disableCopyBufferToImage_) {
+ result = HostBlitManager::copyBufferToImage(srcMemory,
+ dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+ }
+ else {
+ Buffer& srcBuffer = static_cast<Buffer&>(srcMemory);
+ Image& dstImage = static_cast<Image&>(dstMemory);
- hsa_status_t status = hsa_ext_image_clear(
- gpu().gpu_device(), image->getHsaImageObject(),
- pattern, &image_region);
- return (status == HSA_STATUS_SUCCESS);
+ // Use ROC path for a transfer
+ // Note: it doesn't support SDMA
+ address srcHost = reinterpret_cast(srcBuffer.getDeviceMemory()) +
+ srcOrigin[0];
+
+ hsa_ext_image_region_t image_region;
+ image_region.offset.x = dstOrigin[0];
+ image_region.offset.y = dstOrigin[1];
+ image_region.offset.z = dstOrigin[2];
+ image_region.range.x = size[0];
+ image_region.range.y = size[1];
+ image_region.range.z = size[2];
+
+ hsa_status_t status = hsa_ext_image_import(gpu().gpu_device(),
+ srcHost, rowPitch, slicePitch, dstImage.getHsaImageObject(), &image_region);
+ result = (status == HSA_STATUS_SUCCESS) ? true : false;
+
+ // Check if a HostBlit transfer is required
+ if (completeOperation_ && !result) {
+ result = HostBlitManager::copyBufferToImage(srcMemory,
+ dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+ }
+ }
+
+ return result;
}
-static void
+bool
+DmaBlitManager::copyImage(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool entire) const
+{
+ bool result = false;
+
+ if (setup_.disableCopyImage_) {
+ return HostBlitManager::copyImage(srcMemory, dstMemory,
+ srcOrigin, dstOrigin, size, entire);
+ }
+ else {
+ //! @todo Add HW accelerated path
+ return HostBlitManager::copyImage(srcMemory, dstMemory,
+ srcOrigin, dstOrigin, size, entire);
+ }
+
+ return result;
+}
+
+bool DmaBlitManager::hsaCopy(
+ const Memory& srcMemory,
+ const Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool enableCopyRect,
+ bool flushDMA) const
+{
+ address src = reinterpret_cast(srcMemory.getDeviceMemory());
+ address dst = reinterpret_cast(dstMemory.getDeviceMemory());
+
+ src += srcOrigin[0];
+ dst += dstOrigin[0];
+
+ // Just call copy function for full profile
+ hsa_status_t status;
+ if (dev().agent_profile() == HSA_PROFILE_FULL) {
+ status = hsa_memory_copy(dst, src, size[0]);
+ if (status != HSA_STATUS_SUCCESS) {
+ LogPrintfError("Hsa copy of data failed with code %d", status);
+ }
+ return (status == HSA_STATUS_SUCCESS);
+ }
+
+ // Detect the agents for memory allocations
+ const hsa_agent_t srcAgent = (srcMemory.isHostMemDirectAccess()) ?
+ dev().getCpuAgent() : dev().getBackendDevice();
+ const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ?
+ dev().getCpuAgent() : dev().getBackendDevice();
+
+ const hsa_signal_value_t kInitVal = 1;
+ hsa_signal_store_relaxed(completion_signal_, kInitVal);
+
+ // Use SDMA to transfer the data
+ status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
+ size[0], 0, nullptr, completion_signal_);
+ if (status == HSA_STATUS_SUCCESS) {
+ hsa_signal_value_t val = hsa_signal_wait_acquire(
+ completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
+ uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
+ if (val != (kInitVal - 1)) {
+ LogError("Async copy failed");
+ status = HSA_STATUS_ERROR;
+ }
+ }
+ else {
+ LogPrintfError("Hsa copy from host to device failed with code %d", status);
+ }
+
+ return (status == HSA_STATUS_SUCCESS);
+}
+
+bool DmaBlitManager::hsaCopyStaged(
+ const_address hostSrc, address hostDst, size_t size, address staging, bool hostToDev) const
+{
+ // No allocation is necessary for Full Profile
+ hsa_status_t status;
+ if (dev().agent_profile() == HSA_PROFILE_FULL) {
+ status = hsa_memory_copy(hostDst, hostSrc, size);
+ if (status != HSA_STATUS_SUCCESS) {
+ LogPrintfError("Hsa copy of data failed with code %d", status);
+ }
+ return (status == HSA_STATUS_SUCCESS);
+ }
+
+ size_t totalSize = size;
+ size_t offset = 0;
+
+ address hsaBuffer = staging;
+
+ const hsa_signal_value_t kInitVal = 1;
+
+ // Transfer the data in chunks of at most stagedXferSize_ bytes
+ while (totalSize > 0) {
+ size = std::min(totalSize, dev().settings().stagedXferSize_);
+ hsa_signal_store_relaxed(completion_signal_, kInitVal);
+
+ // Copy data from Host to Device
+ if (hostToDev) {
+ memcpy(hsaBuffer, hostSrc + offset, size);
+ status = hsa_amd_memory_async_copy(
+ hostDst + offset, dev().getBackendDevice(), hsaBuffer,
+ dev().getCpuAgent(), size, 0, NULL, completion_signal_);
+ if (status == HSA_STATUS_SUCCESS) {
+ hsa_signal_value_t val =
+ hsa_signal_wait_acquire(completion_signal_,
+ HSA_SIGNAL_CONDITION_EQ, 0,
+ uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
+
+ if (val != (kInitVal - 1)) {
+ LogError("Async copy failed");
+ return false;
+ }
+ }
+ else {
+ LogPrintfError("Hsa copy from host to device failed with code %d", status);
+ return false;
+ }
+ totalSize -= size;
+ offset += size;
+ continue;
+ }
+
+ // Copy data from Device to Host
+ status = hsa_amd_memory_async_copy(hsaBuffer,
+ dev().getCpuAgent(), hostSrc + offset, dev().getBackendDevice(),
+ size, 0, NULL, completion_signal_);
+ if (status == HSA_STATUS_SUCCESS) {
+ hsa_signal_value_t val = hsa_signal_wait_acquire(
+ completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
+ HSA_WAIT_STATE_ACTIVE);
+
+ if (val != (kInitVal - 1)) {
+ LogError("Async copy failed");
+ return false;
+ }
+ memcpy(hostDst + offset, hsaBuffer, size);
+ }
+ else {
+ LogPrintfError("Hsa copy from device to host failed with code %d", status);
+ return false;
+ }
+ totalSize -= size;
+ offset += size;
+ }
+
+ return true;
+}
+
+KernelBlitManager::KernelBlitManager(
+ VirtualGPU& gpu, Setup setup)
+ : DmaBlitManager(gpu, setup)
+ , program_(NULL)
+ , constantBuffer_(NULL)
+ , xferBufferSize_(0)
+ , lockXferOps_(NULL)
+{
+ for (uint i = 0; i < BlitTotal; ++i) {
+ kernels_[i] = NULL;
+ }
+
+ for (uint i = 0; i < MaxXferBuffers; ++i) {
+ xferBuffers_[i] = NULL;
+ }
+
+ completeOperation_ = false;
+}
+
+KernelBlitManager::~KernelBlitManager()
+{
+ for (uint i = 0; i < BlitTotal; ++i) {
+ if (NULL != kernels_[i]) {
+ kernels_[i]->release();
+ }
+ }
+ if (NULL != program_) {
+ program_->release();
+ }
+
+ if (NULL != context_) {
+ // Release a dummy context
+ context_->release();
+ }
+
+ if (NULL != constantBuffer_) {
+ constantBuffer_->release();
+ }
+
+ for (uint i = 0; i < MaxXferBuffers; ++i) {
+ if (NULL != xferBuffers_[i]) {
+ xferBuffers_[i]->release();
+ }
+ }
+
+ delete lockXferOps_;
+}
+
+bool
+KernelBlitManager::create(amd::Device& device)
+{
+ if (!DmaBlitManager::create(device)) {
+ return false;
+ }
+
+ if (!createProgram(static_cast(device))) {
+ return false;
+ }
+ return true;
+}
+
+bool
+KernelBlitManager::createProgram(Device& device)
+{
+ if (device.blitProgram() == nullptr) {
+ return false;
+ }
+
+ std::vector devices;
+ devices.push_back(&device);
+
+ // Save context and program for this device
+ context_ = device.blitProgram()->context_;
+ context_->retain();
+ program_ = device.blitProgram()->program_;
+ program_->retain();
+
+ bool result = false;
+ do {
+ // Create kernel objects for all blits
+ for (uint i = 0; i < BlitTotal; ++i) {
+ const amd::Symbol* symbol = program_->findSymbol(BlitName[i]);
+ if (symbol == NULL) {
+ break;
+ }
+ kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]);
+ if (kernels_[i] == NULL) {
+ break;
+ }
+ // Validate blit kernels for the scratch memory usage (pre SI)
+ if (!device.validateKernel(*kernels_[i], &gpu())) {
+ break;
+ }
+ }
+
+ result = true;
+ } while(!result);
+
+ // Create an internal constant buffer
+ constantBuffer_ = new (*context_)
+ amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
+
+ if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) {
+ constantBuffer_->release();
+ constantBuffer_ = NULL;
+ return false;
+ }
+ else if (constantBuffer_ == NULL) {
+ return false;
+ }
+
+ // Assign the constant buffer to the current virtual GPU
+ constantBuffer_->setVirtualDevice(&gpu());
+
+ if (dev().settings().xferBufSize_ > 0) {
+ xferBufferSize_ = dev().settings().xferBufSize_;
+ for (uint i = 0; i < MaxXferBuffers; ++i) {
+ // Create internal xfer buffers for image copy optimization
+ xferBuffers_[i] = new (*context_)
+ amd::Buffer(*context_, 0, xferBufferSize_);
+
+ if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) {
+ xferBuffers_[i]->release();
+ xferBuffers_[i] = NULL;
+ return false;
+ }
+ else if (xferBuffers_[i] == NULL) {
+ return false;
+ }
+
+ // Assign the xfer buffer to the current virtual GPU
+ xferBuffers_[i]->setVirtualDevice(&gpu());
+ //! @note Workaround for conformance allocation test.
+ //! Force GPU mem alloc.
+ //! Unaligned images require xfer optimization,
+ //! but deferred memory allocation can cause
+ //! virtual heap fragmentation for big allocations and
+ //! then fail the following test with 32 bit ISA, because
+ //! runtime runs out of 4GB space.
+ dev().getRocMemory(xferBuffers_[i]);
+ }
+ }
+
+ lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true);
+ if (NULL == lockXferOps_) {
+ return false;
+ }
+
+ return result;
+}
+
+// The following data structures will be used for the view creations.
+// Some formats have to be converted before a kernel blit operation
+struct FormatConvertion {
+ cl_uint clOldType_;
+ cl_uint clNewType_;
+};
+
+// The list of rejected data formats and corresponding conversion
+static const FormatConvertion RejectedData[] =
+{
+ { CL_UNORM_INT8, CL_UNSIGNED_INT8 },
+ { CL_UNORM_INT16, CL_UNSIGNED_INT16 },
+ { CL_SNORM_INT8, CL_UNSIGNED_INT8 },
+ { CL_SNORM_INT16, CL_UNSIGNED_INT16 },
+ { CL_HALF_FLOAT, CL_UNSIGNED_INT16 },
+ { CL_FLOAT, CL_UNSIGNED_INT32 },
+ { CL_SIGNED_INT8, CL_UNSIGNED_INT8 },
+ { CL_SIGNED_INT16, CL_UNSIGNED_INT16 },
+ { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 },
+ { CL_SIGNED_INT32, CL_UNSIGNED_INT32 }
+};
+
+// The list of rejected channel's order and corresponding conversion
+static const FormatConvertion RejectedOrder[] =
+{
+ { CL_A, CL_R },
+ { CL_RA, CL_RG },
+ { CL_LUMINANCE, CL_R },
+ { CL_INTENSITY, CL_R },
+ { CL_RGB, CL_RGBA },
+ { CL_BGRA, CL_RGBA },
+ { CL_ARGB, CL_RGBA },
+ { CL_sRGB, CL_RGBA },
+ { CL_sRGBx, CL_RGBA },
+ { CL_sRGBA, CL_RGBA },
+ { CL_sBGRA, CL_RGBA },
+ { CL_DEPTH, CL_R }
+};
+
+const uint RejectedFormatDataTotal =
+ sizeof(RejectedData) / sizeof(FormatConvertion);
+const uint RejectedFormatChannelTotal =
+ sizeof(RejectedOrder) / sizeof(FormatConvertion);
+
+bool
+KernelBlitManager::copyBufferToImage(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool entire,
+ size_t rowPitch,
+ size_t slicePitch) const
+{
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
+ static const bool CopyRect = false;
+ // Flush DMA for ASYNC copy
+ static const bool FlushDMA = true;
+ size_t imgRowPitch = size[0] * gpuMem(dstMemory).owner()->asImage()->getImageFormat().getElementSize();
+ size_t imgSlicePitch = imgRowPitch * size[1];
+
+ if (setup_.disableCopyBufferToImage_) {
+ result = DmaBlitManager::copyBufferToImage(
+ srcMemory, dstMemory, srcOrigin, dstOrigin, size,
+ entire, rowPitch, slicePitch);
+ synchronize();
+ return result;
+ }
+ // Check if buffer is in system memory with direct access
+ else if (gpuMem(srcMemory).isHostMemDirectAccess() &&
+ (((rowPitch == 0) && (slicePitch == 0)) ||
+ ((rowPitch == imgRowPitch) &&
+ ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
+ // First attempt to do this all with DMA,
+ // but there are restrictions with older hardware
+ if (dev().settings().imageDMA_) {
+ result = DmaBlitManager::copyBufferToImage(
+ srcMemory, dstMemory, srcOrigin, dstOrigin, size,
+ entire, rowPitch, slicePitch);
+ if (result) {
+ synchronize();
+ return result;
+ }
+ }
+ }
+
+ if (!result) {
+ result = copyBufferToImageKernel(srcMemory,
+ dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+ }
+
+ synchronize();
+
+ return result;
+}
+
+void
CalcRowSlicePitches(
cl_ulong* pitch, const cl_int* copySize,
size_t rowPitch, size_t slicePitch, const Memory& mem)
{
- const roc::Image &hsaImage = static_cast< const roc::Image &>(mem);
- bool img1Darray =
- (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false;
- size_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize();
+ uint32_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize();
+ bool img1Darray = (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false;
if (rowPitch == 0) {
pitch[0] = copySize[0];
@@ -544,265 +1012,546 @@ CalcRowSlicePitches(
}
}
-KernelBlitManager::KernelBlitManager(device::VirtualDevice& vDev, Setup setup)
- : HsaBlitManager(vDev, setup),
- context_(NULL),
- program_(NULL)
+static void
+setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value)
{
- for (uint i = 0; i < BlitTotal; ++i) {
- kernels_[i] = NULL;
- }
-}
-
-KernelBlitManager::~KernelBlitManager()
-{
- for (uint i = 0; i < BlitTotal; ++i) {
- if (NULL != kernels_[i]) {
- kernels_[i]->release();
- }
- }
-
- if (NULL != program_) {
- program_->release();
- }
-
- if (NULL != context_) {
- // Release a dummy context
- context_->release();
- }
+ kernel->parameters().set(index, size, value);
}
bool
-KernelBlitManager::readBuffer(
- device::Memory& srcMemory,
- void* dstHost,
- const amd::Coord3D& origin,
+KernelBlitManager::copyBufferToImageKernel(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
const amd::Coord3D& size,
- bool entire) const
+ bool entire,
+ size_t rowPitch,
+ size_t slicePitch) const
{
- //if (setup_.disableReadBuffer_ || srcMemory.isHostMemDirectAccess()) {
- // return device::HostBlitManager::readBuffer(srcMemory, dstHost, origin,
- // size, entire);
- //}
- // Exercise HSA path for now.
- return HsaBlitManager::readBuffer(srcMemory, dstHost, origin,
- size, entire);
-
- amd::Buffer *dstMemory = new (*context_) amd::Buffer(
- *context_, CL_MEM_USE_HOST_PTR, size[0]);
-
- if (!dstMemory->create(const_cast(dstHost))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
- }
-
- device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_);
- if (devDstMemory== NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- bool result = copyBuffer(
- srcMemory, *devDstMemory, origin, amd::Coord3D(0), size, entire);
-
- // Wait for the transfer to finish so that we could safely release the
- // destination memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- dstMemory->release();
-
- return result;
-}
-
-bool
-KernelBlitManager::readBufferRect(
- device::Memory& srcMemory,
- void* dstHost,
- const amd::BufferRect& bufRect,
- const amd::BufferRect& hostRect,
- const amd::Coord3D& size,
- bool entire) const
-{
- // if (setup_.disableReadBufferRect_ || srcMemory.isHostMemDirectAccess()) {
- //return device::HostBlitManager::readBufferRect(
- // srcMemory, dstHost, bufRect, hostRect, size, entire);
- // }
-
- // Exercise HSA path for now.
- return HsaBlitManager::readBufferRect(
- srcMemory, dstHost, bufRect, hostRect, size, entire);
-
- size_t dstSize = hostRect.start_ + hostRect.end_;
- amd::Buffer *dstMemory =
- new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, dstSize);
-
- if (!dstMemory->create(const_cast(dstHost))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
- }
-
- device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_);
- if (devDstMemory== NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- bool result = copyBufferRect(
- srcMemory, *devDstMemory, bufRect, hostRect, size, entire);
-
- // Wait for the transfer to finish so that we could safely release the
- // destination memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- dstMemory->release();
-
- return result;
-}
-
-void
-FindLinearSize(
- size_t& linearSize, const amd::Coord3D& size,
- size_t& rowPitch, size_t& slicePitch, const device::Memory& mem)
-{
- const roc::Image &image = static_cast(mem);
- size_t elementSize = mem.owner()->asImage()->getImageFormat().getElementSize();
-
- linearSize = size[0] * elementSize;
- if ((rowPitch == 0) || (rowPitch == linearSize)) {
- rowPitch = 0;
- }
- else {
- linearSize = rowPitch;
- }
-
- // Calculate the pin size, which should be equal to the copy size
- for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) {
- linearSize *= size[i];
- if (i == 1) {
- if ((slicePitch == 0) || (slicePitch == linearSize)) {
- slicePitch = 0;
- }
- else {
- if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) {
- linearSize = slicePitch;
- }
- else {
- linearSize = slicePitch * size[i];
- }
- }
- }
- }
-}
-
-// The following data structures will be used for the view creations.
-// Some formats has to be converted before a kernel blit operation
-struct FormatConvertion {
- cl_uint clOldType_;
- cl_uint clNewType_;
-};
-
-// The list of rejected data formats and corresponding conversion
-static const FormatConvertion RejectedData[] =
-{
- { CL_UNORM_INT8, CL_UNSIGNED_INT8 },
- { CL_UNORM_INT16, CL_UNSIGNED_INT16 },
- { CL_SNORM_INT8, CL_UNSIGNED_INT8 },
- { CL_SNORM_INT16, CL_UNSIGNED_INT16 },
- { CL_HALF_FLOAT, CL_UNSIGNED_INT16 },
- { CL_FLOAT, CL_UNSIGNED_INT32 },
- { CL_SIGNED_INT8, CL_UNSIGNED_INT8 },
- { CL_SIGNED_INT16, CL_UNSIGNED_INT16 },
- { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 },
- { CL_SIGNED_INT32, CL_UNSIGNED_INT32 }
-};
-
-// The list of rejected channel's order and corresponding conversion
-static const FormatConvertion RejectedOrder[] =
-{
- { CL_A, CL_R },
- { CL_RA, CL_RG },
- { CL_LUMINANCE, CL_R },
- { CL_INTENSITY, CL_R },
- { CL_RGB, CL_RGBA },
- { CL_BGRA, CL_RGBA },
- { CL_ARGB, CL_RGBA },
- { CL_sRGB, CL_RGBA },
- { CL_sRGBx, CL_RGBA },
- { CL_sRGBA, CL_RGBA },
- { CL_sBGRA, CL_RGBA },
- { CL_DEPTH, CL_R}
-};
-
-const uint RejectedFormatDataTotal =
- sizeof(RejectedData) / sizeof(FormatConvertion);
-const uint RejectedFormatChannelTotal =
- sizeof(RejectedOrder) / sizeof(FormatConvertion);
-
-amd::Image::Format
-KernelBlitManager::filterFormat(amd::Image::Format oldFormat) const
-{
- cl_image_format newFormat;
- newFormat.image_channel_data_type = oldFormat.image_channel_data_type;
- newFormat.image_channel_order = oldFormat.image_channel_order;
+ bool rejected = false;
+ Memory* dstView = &gpuMem(dstMemory);
+ bool releaseView = false;
+ bool result = false;
+ amd::Image::Format newFormat(gpuMem(dstMemory).owner()->asImage()->getImageFormat());
// Find unsupported formats
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
- if (RejectedData[i].clOldType_ == oldFormat.image_channel_data_type) {
+ if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+ rejected = true;
break;
}
}
// Find unsupported channel's order
for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
- if (RejectedOrder[i].clOldType_ == oldFormat.image_channel_order) {
+ if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+ rejected = true;
break;
}
}
- return amd::Image::Format(newFormat);
+ // If the image format was rejected, then attempt to create a view
+ if (rejected &&
+ // todo ROC runtime has a problem with a view for this format
+ (gpuMem(dstMemory).owner()->asImage()->
+ getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
+ dstView = createView(gpuMem(dstMemory), newFormat);
+ if (dstView != NULL) {
+ rejected = false;
+ releaseView = true;
+ }
+ }
+
+ // Format rejected and no view: use the DmaBlitManager path, forwarding the caller's pitches
+ if (rejected) {
+ return DmaBlitManager::copyBufferToImage(srcMemory, dstMemory,
+ srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+ }
+
+ // Use a common blit type with three dimensions by default
+ uint blitType = BlitCopyBufferToImage;
+ size_t dim = 0;
+ size_t globalWorkOffset[3] = { 0, 0, 0 };
+ size_t globalWorkSize[3];
+ size_t localWorkSize[3];
+
+ // Program the kernels workload depending on the blit dimensions
+ dim = 3;
+ if (dstMemory.owner()->asImage()->getDims() == 1) {
+ globalWorkSize[0] = amd::alignUp(size[0], 256);
+ globalWorkSize[1] = amd::alignUp(size[1], 1);
+ globalWorkSize[2] = amd::alignUp(size[2], 1);
+ localWorkSize[0] = 256;
+ localWorkSize[1] = localWorkSize[2] = 1;
+ }
+ else if (dstMemory.owner()->asImage()->getDims() == 2) {
+ globalWorkSize[0] = amd::alignUp(size[0], 16);
+ globalWorkSize[1] = amd::alignUp(size[1], 16);
+ globalWorkSize[2] = amd::alignUp(size[2], 1);
+ localWorkSize[0] = localWorkSize[1] = 16;
+ localWorkSize[2] = 1;
+ }
+ else {
+ globalWorkSize[0] = amd::alignUp(size[0], 8);
+ globalWorkSize[1] = amd::alignUp(size[1], 8);
+ globalWorkSize[2] = amd::alignUp(size[2], 4);
+ localWorkSize[0] = localWorkSize[1] = 8;
+ localWorkSize[2] = 4;
+ }
+
+ // Program kernels arguments for the blit operation
+ cl_mem mem = as_cl(srcMemory.owner());
+ setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+ mem = as_cl(dstView->owner());
+ setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+ uint32_t memFmtSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize();
+ uint32_t components = dstMemory.owner()->asImage()->getImageFormat().getNumChannels();
+
+ // 1 element granularity for writes by default
+ cl_int granularity = 1;
+ if (memFmtSize == 2) {
+ granularity = 2;
+ }
+ else if (memFmtSize >= 4) {
+ granularity = 4;
+ }
+ CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
+ cl_ulong srcOrg[4] = { srcOrigin[0] / granularity,
+ srcOrigin[1],
+ srcOrigin[2], 0 };
+ setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
+
+ cl_int dstOrg[4] = { (cl_int)dstOrigin[0],
+ (cl_int)dstOrigin[1],
+ (cl_int)dstOrigin[2], 0 };
+ cl_int copySize[4] = { (cl_int)size[0],
+ (cl_int)size[1],
+ (cl_int)size[2], 0 };
+
+ setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
+ setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
+
+ // Program memory format
+ uint multiplier = memFmtSize / sizeof(uint32_t);
+ multiplier = (multiplier == 0) ? 1 : multiplier;
+ cl_uint format[4] = { components,
+ memFmtSize / components,
+ multiplier, 0 };
+ setArgument(kernels_[blitType], 5, sizeof(format), format);
+
+ // Program row and slice pitches
+ cl_ulong pitch[4] = { 0 };
+ CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory));
+ setArgument(kernels_[blitType], 6, sizeof(pitch), pitch);
+
+ // Create ND range object for the kernel's execution
+ amd::NDRangeContainer ndrange(dim,
+ globalWorkOffset, globalWorkSize, localWorkSize);
+
+ // Execute the blit
+ address parameters = kernels_[blitType]->parameters().capture(dev());
+ result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+ kernels_[blitType]->parameters().release(const_cast(parameters), dev());
+
+ if (releaseView) {
+ // todo SRD programming could be changed to avoid a stall
+ gpu().releaseGpuMemoryFence();
+ dstView->owner()->release();
+ }
+
+ return result;
}
-device::Memory *
-KernelBlitManager::createImageView(
- device::Memory &parent,
- amd::Image::Format newFormat) const
+bool
+KernelBlitManager::copyImageToBuffer(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool entire,
+ size_t rowPitch,
+ size_t slicePitch) const
{
- amd::Image *image =
- parent.owner()->asImage()->createView(parent.owner()->getContext(), newFormat, &gpu());
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
+ static const bool CopyRect = false;
+ // Flush DMA for ASYNC copy
+ static const bool FlushDMA = true;
+ size_t imgRowPitch = size[0] * gpuMem(srcMemory).owner()->asImage()->getImageFormat().getElementSize();
+ size_t imgSlicePitch = imgRowPitch * size[1];
- if (image == NULL) {
- LogError("[OCL] Fail to allocate view of image object");
- return NULL;
+ if (setup_.disableCopyImageToBuffer_) {
+ result = HostBlitManager::copyImageToBuffer(
+ srcMemory, dstMemory, srcOrigin, dstOrigin,
+ size, entire, rowPitch, slicePitch);
+ synchronize();
+ return result;
+ }
+ // Check if buffer is in system memory with direct access
+ else if (gpuMem(dstMemory).isHostMemDirectAccess() &&
+ (((rowPitch == 0) && (slicePitch == 0)) ||
+ ((rowPitch == imgRowPitch) &&
+ ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
+ // First attempt to do this all with DMA,
+ // but there are restrictions with older hardware
+ // If the dest buffer is external physical(SDI), copy two step as
+ // single step SDMA is causing corruption and the cause is under investigation
+ if (dev().settings().imageDMA_) {
+ result = DmaBlitManager::copyImageToBuffer(
+ srcMemory, dstMemory, srcOrigin, dstOrigin,
+ size, entire, rowPitch, slicePitch);
+ if (result) {
+ synchronize();
+ return result;
+ }
+ }
}
- Image* devImage = new roc::Image(static_cast(dev_), *image);
- if (devImage == NULL) {
- LogError("[OCL] Fail to allocate device mem object for the view");
- image->release();
- return NULL;
+ if (!result) {
+ result = copyImageToBufferKernel(srcMemory,
+ dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
}
- if (!devImage->createView(static_cast(parent))) {
- LogError("[OCL] Fail to create device mem object for the view");
- delete devImage;
- image->release();
- return NULL;
+ synchronize();
+
+ return result;
+}
+
+bool
+KernelBlitManager::copyImageToBufferKernel(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool entire,
+ size_t rowPitch,
+ size_t slicePitch) const
+{
+ bool rejected = false;
+ Memory* srcView = &gpuMem(srcMemory);
+ bool releaseView = false;
+ bool result = false;
+ amd::Image::Format newFormat(gpuMem(srcMemory).owner()->asImage()->getImageFormat());
+
+ // Find unsupported formats
+ for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+ if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+ newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+ rejected = true;
+ break;
+ }
}
- image->replaceDeviceMemory(&dev_, devImage);
+ // Find unsupported channel's order
+ for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+ if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+ newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+ rejected = true;
+ break;
+ }
+ }
- return devImage;
+ // If the image format was rejected, then attempt to create a view
+ if (rejected &&
+ // todo ROC runtime has a problem with a view for this format
+ (gpuMem(srcMemory).owner()->asImage()->
+ getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
+ srcView = createView(gpuMem(srcMemory), newFormat);
+ if (srcView != NULL) {
+ rejected = false;
+ releaseView = true;
+ }
+ }
+
+ // Format rejected and no view: use the DmaBlitManager path, forwarding the caller's pitches
+ if (rejected) {
+ return DmaBlitManager::copyImageToBuffer(srcMemory, dstMemory,
+ srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+ }
+
+ uint blitType = BlitCopyImageToBuffer;
+ size_t dim = 0;
+ size_t globalWorkOffset[3] = { 0, 0, 0 };
+ size_t globalWorkSize[3];
+ size_t localWorkSize[3];
+
+ // Program the kernels workload depending on the blit dimensions
+ dim = 3;
+ // Find the current blit type
+ if (srcMemory.owner()->asImage()->getDims() == 1) {
+ globalWorkSize[0] = amd::alignUp(size[0], 256);
+ globalWorkSize[1] = amd::alignUp(size[1], 1);
+ globalWorkSize[2] = amd::alignUp(size[2], 1);
+ localWorkSize[0] = 256;
+ localWorkSize[1] = localWorkSize[2] = 1;
+ }
+ else if (srcMemory.owner()->asImage()->getDims() == 2) {
+ globalWorkSize[0] = amd::alignUp(size[0], 16);
+ globalWorkSize[1] = amd::alignUp(size[1], 16);
+ globalWorkSize[2] = amd::alignUp(size[2], 1);
+ localWorkSize[0] = localWorkSize[1] = 16;
+ localWorkSize[2] = 1;
+ }
+ else {
+ globalWorkSize[0] = amd::alignUp(size[0], 8);
+ globalWorkSize[1] = amd::alignUp(size[1], 8);
+ globalWorkSize[2] = amd::alignUp(size[2], 4);
+ localWorkSize[0] = localWorkSize[1] = 8;
+ localWorkSize[2] = 4;
+ }
+
+ // Program kernels arguments for the blit operation
+ cl_mem mem = as_cl(srcView->owner());
+ setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+ mem = as_cl(dstMemory.owner());
+ setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+
+ // Update extra parameters for USHORT and UBYTE pointers.
+ // Only then compiler can optimize the kernel to use
+ // UAV Raw for other writes
+ setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem);
+ setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem);
+
+ cl_int srcOrg[4] = { (cl_int)srcOrigin[0],
+ (cl_int)srcOrigin[1],
+ (cl_int)srcOrigin[2], 0 };
+ cl_int copySize[4] = { (cl_int)size[0],
+ (cl_int)size[1],
+ (cl_int)size[2], 0 };
+ setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);
+ uint32_t memFmtSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize();
+ uint32_t components = srcMemory.owner()->asImage()->getImageFormat().getNumChannels();
+
+ // 1 element granularity for writes by default
+ cl_int granularity = 1;
+ if (memFmtSize == 2) {
+ granularity = 2;
+ }
+ else if (memFmtSize >= 4) {
+ granularity = 4;
+ }
+ CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
+ cl_ulong dstOrg[4] = { dstOrigin[0] / granularity,
+ dstOrigin[1],
+ dstOrigin[2], 0 };
+ setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg);
+ setArgument(kernels_[blitType], 6, sizeof(copySize), copySize);
+
+ // Program memory format
+ uint multiplier = memFmtSize / sizeof(uint32_t);
+ multiplier = (multiplier == 0) ? 1 : multiplier;
+ cl_uint format[4] = { components,
+ memFmtSize / components,
+ multiplier, 0 };
+ setArgument(kernels_[blitType], 7, sizeof(format), format);
+
+ // Program row and slice pitches
+ cl_ulong pitch[4] = { 0 };
+ CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory));
+ setArgument(kernels_[blitType], 8, sizeof(pitch), pitch);
+
+ // Create ND range object for the kernel's execution
+ amd::NDRangeContainer ndrange(dim,
+ globalWorkOffset, globalWorkSize, localWorkSize);
+
+ // Execute the blit
+ address parameters = kernels_[blitType]->parameters().capture(dev());
+ result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+ kernels_[blitType]->parameters().release(const_cast(parameters), dev());
+ if (releaseView) {
+ // todo SRD programming could be changed to avoid a stall
+ gpu().releaseGpuMemoryFence();
+ srcView->owner()->release();
+ }
+
+ return result;
+}
+
+bool
+KernelBlitManager::copyImage(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool entire) const
+{
+ amd::ScopedLock k(lockXferOps_);
+ bool rejected = false;
+ Memory* srcView = &gpuMem(srcMemory);
+ Memory* dstView = &gpuMem(dstMemory);
+ bool releaseView = false;
+ bool result = false;
+ amd::Image::Format newFormat(gpuMem(srcMemory).owner()->asImage()->getImageFormat());
+
+ // Find unsupported formats
+ for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+ if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+ newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+ rejected = true;
+ break;
+ }
+ }
+
+ // Search for the rejected channel's order only if the format was rejected
+ // Note: Image blit is independent from the channel order
+ if (rejected) {
+ for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+ if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+ newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+ rejected = true;
+ break;
+ }
+ }
+ }
+
+ // Attempt to create a view if the format was rejected
+ if (rejected) {
+ srcView = createView(gpuMem(srcMemory), newFormat);
+ if (srcView != NULL) {
+ dstView = createView(gpuMem(dstMemory), newFormat);
+ if (dstView != NULL) {
+ rejected = false;
+ releaseView = true;
+ }
+ else {
+ delete srcView;
+ }
+ }
+ }
+
+ // Fall into the host path for the entire 2D copy or
+ // if the image format was rejected
+ if (rejected) {
+ result = HostBlitManager::copyImage(srcMemory, dstMemory,
+ srcOrigin, dstOrigin, size, entire);
+ synchronize();
+ return result;
+ }
+
+ uint blitType = BlitCopyImage;
+ size_t dim = 0;
+ size_t globalWorkOffset[3] = { 0, 0, 0 };
+ size_t globalWorkSize[3];
+ size_t localWorkSize[3];
+
+ // Program the kernels workload depending on the blit dimensions
+ dim = 3;
+ // Find the current blit type
+ if ((srcMemory.owner()->asImage()->getDims() == 1) ||
+ (dstMemory.owner()->asImage()->getDims() == 1)) {
+ globalWorkSize[0] = amd::alignUp(size[0], 256);
+ globalWorkSize[1] = amd::alignUp(size[1], 1);
+ globalWorkSize[2] = amd::alignUp(size[2], 1);
+ localWorkSize[0] = 256;
+ localWorkSize[1] = localWorkSize[2] = 1;
+ }
+ else if ((srcMemory.owner()->asImage()->getDims() == 2) ||
+ (dstMemory.owner()->asImage()->getDims() == 2)) {
+ globalWorkSize[0] = amd::alignUp(size[0], 16);
+ globalWorkSize[1] = amd::alignUp(size[1], 16);
+ globalWorkSize[2] = amd::alignUp(size[2], 1);
+ localWorkSize[0] = localWorkSize[1] = 16;
+ localWorkSize[2] = 1;
+ }
+ else {
+ globalWorkSize[0] = amd::alignUp(size[0], 8);
+ globalWorkSize[1] = amd::alignUp(size[1], 8);
+ globalWorkSize[2] = amd::alignUp(size[2], 4);
+ localWorkSize[0] = localWorkSize[1] = 8;
+ localWorkSize[2] = 4;
+ }
+
+ // The current OpenCL spec allows "copy images from a 1D image
+ // array object to a 1D image array object" only.
+ if ((gpuMem(srcMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ||
+ (gpuMem(dstMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY)) {
+ blitType = BlitCopyImage1DA;
+ }
+
+ // Program kernels arguments for the blit operation
+ cl_mem mem = as_cl(srcView->owner());
+ setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+ mem = as_cl(dstView->owner());
+ setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+
+ // Program source origin
+ cl_int srcOrg[4] = { (cl_int)srcOrigin[0],
+ (cl_int)srcOrigin[1],
+ (cl_int)srcOrigin[2], 0 };
+ setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
+
+ // Program destination origin
+ cl_int dstOrg[4] = { (cl_int)dstOrigin[0],
+ (cl_int)dstOrigin[1],
+ (cl_int)dstOrigin[2], 0 };
+ setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
+
+ cl_int copySize[4] = { (cl_int)size[0],
+ (cl_int)size[1],
+ (cl_int)size[2], 0 };
+ setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
+
+ // Create ND range object for the kernel's execution
+ amd::NDRangeContainer ndrange(dim,
+ globalWorkOffset, globalWorkSize, localWorkSize);
+
+ // Execute the blit
+ address parameters = kernels_[blitType]->parameters().capture(dev());
+ result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+ kernels_[blitType]->parameters().release(const_cast(parameters), dev());
+ if (releaseView) {
+ // todo SRD programming could be changed to avoid a stall
+ gpu().releaseGpuMemoryFence();
+ srcView->owner()->release();
+ dstView->owner()->release();
+ }
+
+ synchronize();
+
+ return result;
+}
+
+void
+FindPinSize(
+ size_t& pinSize, const amd::Coord3D& size,
+ size_t& rowPitch, size_t& slicePitch, const Memory& mem)
+{
+ pinSize = size[0] * mem.owner()->asImage()->getImageFormat().getElementSize();
+ if ((rowPitch == 0) || (rowPitch == pinSize)) {
+ rowPitch = 0;
+ }
+ else {
+ pinSize = rowPitch;
+ }
+
+ // Calculate the pin size, which should be equal to the copy size
+ for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) {
+ pinSize *= size[i];
+ if (i == 1) {
+ if ((slicePitch == 0) || (slicePitch == pinSize)) {
+ slicePitch = 0;
+ }
+ else {
+ if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ pinSize = slicePitch;
+ }
+ else {
+ pinSize = slicePitch * size[i];
+ }
+ }
+ }
+ }
}
bool
KernelBlitManager::readImage(
- device::Memory& srcMemory,
+ device::Memory& srcMemory,
void* dstHost,
const amd::Coord3D& origin,
const amd::Coord3D& size,
@@ -810,98 +1559,47 @@ KernelBlitManager::readImage(
size_t slicePitch,
bool entire) const
{
- return HsaBlitManager::readImage(
- srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire);
-}
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
-bool
-KernelBlitManager::writeBuffer(
- const void* srcHost,
- device::Memory& dstMemory,
- const amd::Coord3D& origin,
- const amd::Coord3D& size,
- bool entire) const
-{
- // if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess()) {
- //return device::HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size,
- // entire);
- // }
+ // Use host copy if memory has direct access
+ if (setup_.disableReadImage_ ||
+ (gpuMem(srcMemory).isHostMemDirectAccess())) {
+ result = HostBlitManager::readImage(srcMemory, dstHost,
+ origin, size, rowPitch, slicePitch, entire);
+ synchronize();
+ return result;
+ }
+ else {
+ size_t pinSize;
+ FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory));
- // Exercise HSA path for now.
- return HsaBlitManager::writeBuffer(srcHost, dstMemory, origin, size,
- entire);
+ size_t partial;
+ amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
- amd::Buffer *srcMemory =
- new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, size[0]);
+ if (amdMemory == NULL) {
+ // Force SW copy
+ result = HostBlitManager::readImage(srcMemory, dstHost,
+ origin, size, rowPitch, slicePitch, entire);
+ synchronize();
+ return result;
+ }
- if (!srcMemory->create(const_cast<void*>(srcHost))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
+ // Readjust destination offset
+ const amd::Coord3D dstOrigin(partial);
+
+ // Get device memory for this virtual device
+ Memory* dstMemory = dev().getRocMemory(amdMemory);
+
+ // Copy image to buffer
+ result = copyImageToBuffer(srcMemory, *dstMemory,
+ origin, dstOrigin, size, entire, rowPitch, slicePitch);
+
+ // Add pinned memory for a later release
+ gpu().addPinnedMem(amdMemory);
}
- device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_);
- if (devSrcMemory== NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- bool result =
- copyBuffer(*devSrcMemory, dstMemory, amd::Coord3D(0), origin, size, entire);
-
- // Wait for the transfer to finish so that we could safely release the
- // source memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- srcMemory->release();
-
- return result;
-}
-
-bool
-KernelBlitManager::writeBufferRect(
- const void* srcHost,
- device::Memory& dstMemory,
- const amd::BufferRect& hostRect,
- const amd::BufferRect& bufRect,
- const amd::Coord3D& size,
- bool entire) const
-{
- // if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) {
- //return device::HostBlitManager::writeBufferRect(
- // srcHost, dstMemory, hostRect, bufRect, size, entire);
- // }
-
- // Exercise HSA path for now.
- return HsaBlitManager::writeBufferRect(
- srcHost, dstMemory, hostRect, bufRect, size, entire);
-
- size_t srcSize = hostRect.start_ + hostRect.end_;
- amd::Buffer *srcMemory =
- new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, srcSize);
-
- if (!srcMemory->create(const_cast<void*>(srcHost))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
- }
-
- device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_);
- if (devSrcMemory== NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- bool result = copyBufferRect(
- *devSrcMemory, dstMemory, hostRect, bufRect, size, entire);
-
- // Wait for the transfer to finish so that we could safely release the
- // destination memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- srcMemory->release();
+ synchronize();
return result;
}
@@ -909,132 +1607,81 @@ KernelBlitManager::writeBufferRect(
bool
KernelBlitManager::writeImage(
const void* srcHost,
- device::Memory& dstMemory,
+ device::Memory& dstMemory,
const amd::Coord3D& origin,
const amd::Coord3D& size,
size_t rowPitch,
size_t slicePitch,
bool entire) const
{
- return HsaBlitManager::writeImage(
- srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
-}
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
-bool
-KernelBlitManager::copyBuffer(
- device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& sizeIn,
- bool entire) const
-{
- // if (setup_.disableCopyBuffer_ ||
- // (srcMemory.isHostMemDirectAccess() &&
- // dstMemory.isHostMemDirectAccess())) {
- //return HsaBlitManager::copyBuffer(
- // srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
- // }
+ // Use host copy if memory has direct access
+ if (setup_.disableWriteImage_|| gpuMem(dstMemory).isHostMemDirectAccess()) {
+ result = HostBlitManager::writeImage(
+ srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+ synchronize();
+ return result;
+ }
+ else {
+ size_t pinSize;
+ FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory));
- // Exercise HSA path for now.
- return HsaBlitManager::copyBuffer(
- srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
+ size_t partial;
+ amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
- uint blitType = BlitCopyBuffer;
- size_t dim = 1;
- size_t globalWorkOffset[3] = { 0, 0, 0 };
- size_t globalWorkSize = 0;
- size_t localWorkSize = 0;
-
- const static uint CopyBuffAlignment[3] = { 16, 4, 1 };
- amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
-
- bool aligned;
- uint i;
- for (i = 0; i < 3; ++i) {
- // Check source alignments
- aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0);
- // Check destination alignments
- aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0);
- // Check copy size alignment in the first dimension
- aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0);
-
- if (aligned) {
- if (CopyBuffAlignment[i] != 1) {
- blitType = BlitCopyBufferAligned;
- }
- break;
+ if (amdMemory == NULL) {
+ // Force SW copy
+ result = HostBlitManager::writeImage(
+ srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+ synchronize();
+ return result;
}
+
+ // Readjust source offset
+ const amd::Coord3D srcOrigin(partial);
+
+ // Get device memory for this virtual device
+ Memory* srcMemory = dev().getRocMemory(amdMemory);
+
+ // Copy buffer to image
+ result = copyBufferToImage(*srcMemory, dstMemory,
+ srcOrigin, origin, size, entire, rowPitch, slicePitch);
+
+ // Add pinned memory for a later release
+ gpu().addPinnedMem(amdMemory);
}
- cl_uint remain;
- if (blitType == BlitCopyBufferAligned) {
- size.c[0] /= CopyBuffAlignment[i];
- }
- else {
- remain = size[0] % 4;
- size.c[0] /= 4;
- size.c[0] += 1;
- }
+ synchronize();
- // Program the dispatch dimensions
- localWorkSize = 256;
- globalWorkSize = amd::alignUp(size[0] , 256);
-
- // Program kernels arguments for the blit operation
- cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner()));
- kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
- clmem = ((cl_mem) as_cl(dstMemory.owner()));
- kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
- // Program source origin
- cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
- kernels_[blitType]->parameters().set(2, sizeof(srcOffset), &srcOffset);
-
- // Program destination origin
- cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
- kernels_[blitType]->parameters().set(3, sizeof(dstOffset), &dstOffset);
-
- cl_ulong copySize = size[0];
- kernels_[blitType]->parameters().set(4, sizeof(copySize), &copySize);
-
- if (blitType == BlitCopyBufferAligned) {
- cl_int alignment = CopyBuffAlignment[i];
- kernels_[blitType]->parameters().set(5, sizeof(alignment), &alignment);
- }
- else {
- kernels_[blitType]->parameters().set(5, sizeof(remain), &remain);
- }
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(
- 1, globalWorkOffset, &globalWorkSize, &localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[blitType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(
- ndrange, *kernels_[blitType], parameters, NULL);
- kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev_);
return result;
}
bool
KernelBlitManager::copyBufferRect(
- device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::BufferRect& srcRectIn,
- const amd::BufferRect& dstRectIn,
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::BufferRect& srcRectIn,
+ const amd::BufferRect& dstRectIn,
const amd::Coord3D& sizeIn,
bool entire) const
{
- // if (setup_.disableCopyBuffer_ ||
- // (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess())) {
- //return HsaBlitManager::copyBufferRect(
- // srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire);
- // }
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
+ bool rejected = false;
- // Exercise HSA path for now.
- return HsaBlitManager::copyBufferRect(
- srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire);
+ // Fall into the ROC path for rejected transfers
+ if (setup_.disableCopyBufferRect_ ||
+ gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
+ result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
+ srcRectIn, dstRectIn, sizeIn, entire);
+
+ if (result) {
+ synchronize();
+ return result;
+ }
+ }
uint blitType = BlitCopyBufferRect;
size_t dim = 3;
@@ -1110,300 +1757,269 @@ KernelBlitManager::copyBufferRect(
// Program kernels arguments for the blit operation
- cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner()));
- kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
- clmem = ((cl_mem) as_cl(dstMemory.owner()));
- kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
- cl_ulong src[4] = {srcRect.rowPitch_,
- srcRect.slicePitch_,
- srcRect.start_, 0 };
- kernels_[blitType]->parameters().set(2, sizeof(src), src);
- cl_ulong dst[4] = {dstRect.rowPitch_,
- dstRect.slicePitch_,
- dstRect.start_, 0 };
- kernels_[blitType]->parameters().set(3, sizeof(dst), dst);
- cl_ulong copySize[4] = {size[0],
- size[1],
- size[2],
- CopyRectAlignment[i] };
- kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize);
+ cl_mem mem = as_cl(srcMemory.owner());
+ setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+ mem = as_cl(dstMemory.owner());
+ setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+ cl_ulong src[4] = { srcRect.rowPitch_,
+ srcRect.slicePitch_,
+ srcRect.start_, 0 };
+ setArgument(kernels_[blitType], 2, sizeof(src), src);
+ cl_ulong dst[4] = { dstRect.rowPitch_,
+ dstRect.slicePitch_,
+ dstRect.start_, 0 };
+ setArgument(kernels_[blitType], 3, sizeof(dst), dst);
+ cl_ulong copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] };
+ setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim,
globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit
- address parameters = kernels_[blitType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(
- ndrange, *kernels_[blitType], parameters, NULL);
- kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev_);
+ address parameters = kernels_[blitType]->parameters().capture(dev());
+ result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+ kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev());
+
+ synchronize();
+
return result;
}
bool
-KernelBlitManager::copyImageToBuffer(
- device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size,
- bool entire,
- size_t rowPitch,
- size_t slicePitch) const
-{
- if (dstMemory.isHostMemDirectAccess()) {
- return HsaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin,
- dstOrigin, size, entire, rowPitch,
- slicePitch);
- }
-
- amd::Image::Format oldFormat = srcMemory.owner()->asImage()->getImageFormat();
- amd::Image::Format newFormat = filterFormat(oldFormat);
- bool useView = false;
-
- device::Memory* srcView = &srcMemory;
- if (oldFormat != newFormat) {
- srcView = createImageView(srcMemory, newFormat);
- useView = true;
- }
-
- roc::Image& srcImage = static_cast<roc::Image&>(*srcView);
-
- amd::Image* image = srcImage.owner()->asImage();
- uint blitType = 0;
- blitType = BlitCopyImageToBuffer;
- size_t dim = 0;
- size_t globalWorkOffset[3] = {0, 0, 0};
- size_t globalWorkSize[3];
- size_t localWorkSize[3];
-
- // Program the kernels workload depending on the blit dimensions
- const size_t imageDims = srcImage.owner()->asImage()->getDims();
- dim = 3;
- // Find the current blit type
- if (imageDims == 1) {
- globalWorkSize[0] = amd::alignUp(size[0], 256);
- globalWorkSize[1] = amd::alignUp(size[1], 1);
- globalWorkSize[2] = amd::alignUp(size[2], 1);
- localWorkSize[0] = 256;
- localWorkSize[1] = localWorkSize[2] = 1;
- } else if (imageDims == 2) {
- globalWorkSize[0] = amd::alignUp(size[0], 16);
- globalWorkSize[1] = amd::alignUp(size[1], 16);
- globalWorkSize[2] = amd::alignUp(size[2], 1);
- localWorkSize[0] = localWorkSize[1] = 16;
- localWorkSize[2] = 1;
- } else {
- globalWorkSize[0] = amd::alignUp(size[0], 8);
- globalWorkSize[1] = amd::alignUp(size[1], 8);
- globalWorkSize[2] = amd::alignUp(size[2], 4);
- localWorkSize[0] = localWorkSize[1] = 8;
- localWorkSize[2] = 4;
- }
-
- // Program kernels arguments for the blit operation
- cl_mem clmem = ((cl_mem)as_cl(srcImage.owner()));
- kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
- clmem = ((cl_mem)as_cl(dstMemory.owner()));
- kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
-
- // Update extra paramters for USHORT and UBYTE pointers.
- // Only then compiler can optimize the kernel to use
- // UAV Raw for other writes
- kernels_[blitType]->parameters().set(2, sizeof(cl_mem), &clmem);
- kernels_[blitType]->parameters().set(3, sizeof(cl_mem), &clmem);
-
- cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1],
- (cl_int)srcOrigin[2], 0};
- cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
-
- kernels_[blitType]->parameters().set(4, sizeof(srcOrg), srcOrg);
-
- const size_t elementSize =
- srcImage.owner()->asImage()->getImageFormat().getElementSize();
- const size_t numChannels =
- srcImage.owner()->asImage()->getImageFormat().getNumChannels();
-
- // 1 element granularity for writes by default
- cl_int granularity = 1;
- if (elementSize == 2) {
- granularity = 2;
- } else if (elementSize >= 4) {
- granularity = 4;
- }
- CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
- cl_ulong dstOrg[4] = {dstOrigin[0] / granularity, dstOrigin[1], dstOrigin[2],
- 0};
- kernels_[blitType]->parameters().set(5, sizeof(dstOrg), dstOrg);
- kernels_[blitType]->parameters().set(6, sizeof(copySize), copySize);
-
- // Program memory format
- uint multiplier = elementSize / sizeof(uint32_t);
- multiplier = (multiplier == 0) ? 1 : multiplier;
- cl_uint format[4] = {(cl_uint)numChannels,
- (cl_uint)(elementSize / numChannels), multiplier, 0};
- kernels_[blitType]->parameters().set(7, sizeof(format), format);
-
- // Program row and slice pitches
- cl_ulong pitch[4] = {0};
- CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, srcImage);
- kernels_[blitType]->parameters().set(8, sizeof(pitch), pitch);
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize,
- localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[blitType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType],
- parameters, NULL);
-
- gpu().releaseGpuMemoryFence();
-
- kernels_[blitType]->parameters().release(const_cast<address>(parameters),
- dev_);
-
- if (useView) {
- srcView->owner()->release();
- }
-
- return result;
-}
-
-bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch,
- size_t slicePitch) const {
- if (srcMemory.isHostMemDirectAccess()) {
- return HsaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin,
- dstOrigin, size, entire, rowPitch,
- slicePitch);
- }
-
- amd::Image::Format oldFormat = dstMemory.owner()->asImage()->getImageFormat();
- amd::Image::Format newFormat = filterFormat(oldFormat);
- bool useView = false;
-
- device::Memory* dstView = &dstMemory;
- if (oldFormat != newFormat) {
- dstView = createImageView(dstMemory, newFormat);
- useView = true;
- }
-
- roc::Image& dstImage = static_cast<roc::Image&>(*dstView);
-
- // Use a common blit type with three dimensions by default
- uint blitType = BlitCopyBufferToImage;
- size_t dim = 0;
- size_t globalWorkOffset[3] = {0, 0, 0};
- size_t globalWorkSize[3];
- size_t localWorkSize[3];
-
- // Program the kernels workload depending on the blit dimensions
- const size_t imageDims = dstImage.owner()->asImage()->getDims();
- dim = 3;
- if (imageDims == 1) {
- globalWorkSize[0] = amd::alignUp(size[0], 256);
- globalWorkSize[1] = amd::alignUp(size[1], 1);
- globalWorkSize[2] = amd::alignUp(size[2], 1);
- localWorkSize[0] = 256;
- localWorkSize[1] = localWorkSize[2] = 1;
- } else if (imageDims == 2) {
- globalWorkSize[0] = amd::alignUp(size[0], 16);
- globalWorkSize[1] = amd::alignUp(size[1], 16);
- globalWorkSize[2] = amd::alignUp(size[2], 1);
- localWorkSize[0] = localWorkSize[1] = 16;
- localWorkSize[2] = 1;
- } else {
- globalWorkSize[0] = amd::alignUp(size[0], 8);
- globalWorkSize[1] = amd::alignUp(size[1], 8);
- globalWorkSize[2] = amd::alignUp(size[2], 4);
- localWorkSize[0] = localWorkSize[1] = 8;
- localWorkSize[2] = 4;
- }
-
- // Program kernels arguments for the blit operation
- cl_mem clmem = ((cl_mem)as_cl(srcMemory.owner()));
- kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
- clmem = ((cl_mem)as_cl(dstImage.owner()));
- kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
-
- const size_t elementSize =
- dstImage.owner()->asImage()->getImageFormat().getElementSize();
- const size_t numChannels =
- dstImage.owner()->asImage()->getImageFormat().getNumChannels();
-
- // 1 element granularity for writes by default
- cl_int granularity = 1;
- if (elementSize == 2) {
- granularity = 2;
- } else if (elementSize >= 4) {
- granularity = 4;
- }
- CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
- cl_ulong srcOrg[4] = {srcOrigin[0] / granularity, srcOrigin[1], srcOrigin[2],
- 0};
- kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg);
-
- cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1],
- (cl_int)dstOrigin[2], 0};
- cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
-
- kernels_[blitType]->parameters().set(3, sizeof(dstOrg), dstOrg);
- kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize);
-
- // Program memory format
- uint multiplier = elementSize / sizeof(uint32_t);
- multiplier = (multiplier == 0) ? 1 : multiplier;
- cl_uint format[4] = {(cl_uint)numChannels,
- (cl_uint)(elementSize / numChannels), multiplier, 0};
- kernels_[blitType]->parameters().set(5, sizeof(format), format);
-
- // Program row and slice pitches
- cl_ulong pitch[4] = {0};
- CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, dstImage);
- kernels_[blitType]->parameters().set(6, sizeof(pitch), pitch);
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize,
- localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[blitType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType],
- parameters, NULL);
-
- gpu().releaseGpuMemoryFence();
-
- kernels_[blitType]->parameters().release(const_cast<address>(parameters),
- dev_);
-
- if (useView) {
- dstView->owner()->release();
- }
-
- return result;
-}
-
-bool
-KernelBlitManager::copyImage(
- device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
+KernelBlitManager::readBuffer(
+ device::Memory& srcMemory,
+ void* dstHost,
+ const amd::Coord3D& origin,
const amd::Coord3D& size,
bool entire) const
{
- return HsaBlitManager::copyImage(
- srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
+ // Use host copy if memory has direct access
+ if (setup_.disableReadBuffer_ ||
+ (gpuMem(srcMemory).isHostMemDirectAccess())) {
+ result = HostBlitManager::readBuffer(
+ srcMemory, dstHost, origin, size, entire);
+ synchronize();
+ return result;
+ }
+ else {
+ size_t pinSize = size[0];
+ // Check if a pinned transfer can be executed with a single pin
+ if ((pinSize <= dev().settings().pinnedXferSize_) &&
+ (pinSize > MinSizeForPinnedTransfer)) {
+ size_t partial;
+ amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
+
+ if (amdMemory == NULL) {
+ // Force SW copy
+ result = HostBlitManager::readBuffer(
+ srcMemory, dstHost, origin, size, entire);
+ synchronize();
+ return result;
+ }
+
+ // Readjust host mem offset
+ amd::Coord3D dstOrigin(partial);
+
+ // Get device memory for this virtual device
+ Memory* dstMemory = dev().getRocMemory(amdMemory);
+
+ // Copy buffer to the pinned host memory
+ result = copyBuffer(srcMemory, *dstMemory,
+ origin, dstOrigin, size, entire);
+
+ // Add pinned memory for a later release
+ gpu().addPinnedMem(amdMemory);
+ }
+ else {
+ result = DmaBlitManager::readBuffer(
+ srcMemory, dstHost, origin, size, entire);
+ }
+ }
+
+ synchronize();
+
+ return result;
+}
+
+bool
+KernelBlitManager::readBufferRect(
+ device::Memory& srcMemory,
+ void* dstHost,
+ const amd::BufferRect& bufRect,
+ const amd::BufferRect& hostRect,
+ const amd::Coord3D& size,
+ bool entire) const
+{ // Copies the bufRect region of srcMemory to host memory at dstHost, using hostRect as the destination rectangle; returns the result of whichever copy path ran.
+ amd::ScopedLock k(lockXferOps_); // scoped lock on lockXferOps_ for the rest of the function
+ bool result = false;
+
+ // Use host copy if memory has direct access, or if this operation is disabled in setup_
+ if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) {
+ result = HostBlitManager::readBufferRect(
+ srcMemory, dstHost, bufRect, hostRect, size, entire);
+ synchronize();
+ return result;
+ }
+ else {
+ size_t pinSize = hostRect.start_ + hostRect.end_; // size handed to pinHostMemory: start_ plus end_ of the host rectangle
+ size_t partial; // written by pinHostMemory; presumably the offset of dstHost inside the pinned allocation -- TODO confirm
+ amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
+
+ if (amdMemory == NULL) {
+ // Force SW copy (pinning failed)
+ result = HostBlitManager::readBufferRect(
+ srcMemory, dstHost, bufRect, hostRect, size, entire);
+ synchronize();
+ return result;
+ }
+
+ // Readjust host mem offset: same pitches and end_ as hostRect, start_ shifted by partial
+ amd::BufferRect rect;
+ rect.rowPitch_ = hostRect.rowPitch_;
+ rect.slicePitch_ = hostRect.slicePitch_;
+ rect.start_ = hostRect.start_ + partial;
+ rect.end_ = hostRect.end_;
+
+ // Get device memory for this virtual device
+ Memory* dstMemory = dev().getRocMemory(amdMemory);
+
+ // Copy the buffer rectangle from srcMemory into the pinned host memory
+ result = copyBufferRect(srcMemory, *dstMemory,
+ bufRect, rect, size, entire);
+
+ // Add pinned memory for a later release
+ gpu().addPinnedMem(amdMemory);
+ }
+
+ synchronize();
+
+ return result;
+}
+
+bool
+KernelBlitManager::writeBuffer(
+ const void* srcHost,
+ device::Memory& dstMemory,
+ const amd::Coord3D& origin,
+ const amd::Coord3D& size,
+ bool entire) const
+{ // Copies host data at srcHost into dstMemory at origin; returns the result of whichever copy path ran.
+ amd::ScopedLock k(lockXferOps_); // scoped lock on lockXferOps_ for the rest of the function
+ bool result = false;
+
+ // Use host copy if memory has direct access, or if this operation is disabled in setup_
+ if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess()) {
+ result = HostBlitManager::writeBuffer(
+ srcHost, dstMemory, origin, size, entire);
+ synchronize();
+ return result;
+ }
+ else {
+ size_t pinSize = size[0]; // only the first component of size is used as the pin size
+
+ // Check if a pinned transfer can be executed with a single pin: MinSizeForPinnedTransfer < pinSize <= pinnedXferSize_
+ if ((pinSize <= dev().settings().pinnedXferSize_) &&
+ (pinSize > MinSizeForPinnedTransfer)) {
+ size_t partial; // written by pinHostMemory; presumably the offset of srcHost inside the pinned allocation -- TODO confirm
+ amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
+
+ if (amdMemory == NULL) {
+ // Force SW copy (pinning failed)
+ result = HostBlitManager::writeBuffer(
+ srcHost, dstMemory, origin, size, entire);
+ synchronize();
+ return result;
+ }
+
+ // Readjust source offset: the pinned memory is read starting at partial
+ const amd::Coord3D srcOrigin(partial);
+
+ // Get device memory for this virtual device
+ Memory* srcMemory = dev().getRocMemory(amdMemory);
+
+ // Copy buffer from the pinned host memory into dstMemory
+ result = copyBuffer(*srcMemory, dstMemory,
+ srcOrigin, origin, size, entire);
+
+ // Add pinned memory for a later release
+ gpu().addPinnedMem(amdMemory);
+ }
+ else { // size outside the single-pin window: defer to the DmaBlitManager implementation
+ result = DmaBlitManager::writeBuffer(
+ srcHost, dstMemory, origin, size, entire);
+ }
+ }
+
+ synchronize();
+
+ return result;
+}
+
+bool
+KernelBlitManager::writeBufferRect(
+ const void* srcHost,
+ device::Memory& dstMemory,
+ const amd::BufferRect& hostRect,
+ const amd::BufferRect& bufRect,
+ const amd::Coord3D& size,
+ bool entire) const
+{ // Copies the hostRect region of host memory at srcHost into the bufRect region of dstMemory; returns the result of whichever copy path ran.
+ amd::ScopedLock k(lockXferOps_); // scoped lock on lockXferOps_ for the rest of the function
+ bool result = false;
+
+ // Use host copy if memory has direct access, or if this operation is disabled in setup_
+ if (setup_.disableWriteBufferRect_ ||
+ gpuMem(dstMemory).isHostMemDirectAccess()) {
+ result = HostBlitManager::writeBufferRect(
+ srcHost, dstMemory, hostRect, bufRect, size, entire);
+ synchronize();
+ return result;
+ }
+ else {
+ size_t pinSize = hostRect.start_ + hostRect.end_; // size handed to pinHostMemory: start_ plus end_ of the host rectangle
+ size_t partial; // written by pinHostMemory; presumably the offset of srcHost inside the pinned allocation -- TODO confirm
+ amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
+
+ if (amdMemory == NULL) {
+ // Force DMA copy with staging (pinning failed; unlike readBufferRect/writeBuffer this falls back to DmaBlitManager)
+ result = DmaBlitManager::writeBufferRect(
+ srcHost, dstMemory, hostRect, bufRect, size, entire);
+ synchronize();
+ return result;
+ }
+
+ // Readjust source offset. NOTE(review): srcOrigin is not referenced again in this function; rect.start_ below carries the offset.
+ const amd::Coord3D srcOrigin(partial);
+
+ // Get device memory for this virtual device
+ Memory* srcMemory = dev().getRocMemory(amdMemory);
+
+ // Readjust host mem offset: same pitches and end_ as hostRect, start_ shifted by partial
+ amd::BufferRect rect;
+ rect.rowPitch_ = hostRect.rowPitch_;
+ rect.slicePitch_ = hostRect.slicePitch_;
+ rect.start_ = hostRect.start_ + partial;
+ rect.end_ = hostRect.end_;
+
+ // Copy buffer rect from the pinned host memory into dstMemory
+ result = copyBufferRect(*srcMemory, dstMemory,
+ rect, bufRect, size, entire);
+
+ // Add pinned memory for a later release
+ gpu().addPinnedMem(amdMemory);
+ }
+
+ synchronize();
+
+ return result;
 }
bool
KernelBlitManager::fillBuffer(
- device::Memory& memory,
+ device::Memory& memory,
const void* pattern,
size_t patternSize,
const amd::Coord3D& origin,
@@ -1411,72 +2027,164 @@ KernelBlitManager::fillBuffer(
bool entire
) const
{
- if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) {
- return HostBlitManager::fillBuffer(memory, pattern, patternSize, origin,
- size, entire);
- }
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
- uint fillType = FillBuffer;
- size_t globalWorkOffset[3] = { 0, 0, 0 };
- cl_ulong fillSize = size[0] / patternSize;
- size_t globalWorkSize = amd::alignUp(fillSize, 256);
- size_t localWorkSize = 256;
- bool dwordAligned =
- ((patternSize % sizeof(uint32_t)) == 0) ? true : false;
-
- // Program kernels arguments for the fill operation
- if (dwordAligned) {
- kernels_[fillType]->parameters().set(0, sizeof(cl_mem), NULL);
- cl_mem clmem = ((cl_mem) as_cl(memory.owner()));
- kernels_[fillType]->parameters().set(1, sizeof(cl_mem), &clmem);
+ // Use host fill if memory has direct access
+ if (setup_.disableFillBuffer_ ||
+ gpuMem(memory).isHostMemDirectAccess()) {
+ result = HostBlitManager::fillBuffer(
+ memory, pattern, patternSize, origin, size, entire);
+ synchronize();
+ return result;
}
else {
- cl_mem clmem = ((cl_mem) as_cl(memory.owner()));
- kernels_[fillType]->parameters().set(0, sizeof(cl_mem), &clmem);
- kernels_[fillType]->parameters().set(1, sizeof(cl_mem), NULL);
+ uint fillType = FillBuffer;
+ size_t globalWorkOffset[3] = { 0, 0, 0 };
+ cl_ulong fillSize = size[0] / patternSize;
+ size_t globalWorkSize = amd::alignUp(fillSize, 256);
+ size_t localWorkSize = 256;
+ bool dwordAligned =
+ ((patternSize % sizeof(uint32_t)) == 0) ? true : false;
+
+ // Program kernels arguments for the fill operation
+ cl_mem mem = as_cl(memory.owner());
+ if (dwordAligned) {
+ setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL);
+ setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
+ }
+ else {
+ setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
+ setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL);
+ }
+ Memory* gpuCB = dev().getRocMemory(constantBuffer_);
+ if (gpuCB == NULL) {
+ return false;
+ }
+ void* constBuf = gpuCB->getDeviceMemory();
+ memcpy(constBuf, pattern, patternSize);
+
+ mem = as_cl(gpuCB->owner());
+ setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
+ cl_ulong offset = origin[0];
+ if (dwordAligned) {
+ patternSize /= sizeof(uint32_t);
+ offset /= sizeof(uint32_t);
+ }
+ setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize);
+ setArgument(kernels_[fillType], 4, sizeof(offset), &offset);
+ setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize);
+
+ // Create ND range object for the kernel's execution
+ amd::NDRangeContainer ndrange(1,
+ globalWorkOffset, &globalWorkSize, &localWorkSize);
+
+ // Execute the blit
+ address parameters = kernels_[fillType]->parameters().capture(dev());
+ result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, NULL);
+ kernels_[fillType]->parameters().release(const_cast<address>(parameters), dev());
}
- amd::Buffer *fillMemory =
- new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, patternSize);
+ synchronize();
- if (!fillMemory->create(const_cast<void*>(pattern))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
+ return result;
+}
+
+bool
+KernelBlitManager::copyBuffer(
+ device::Memory& srcMemory,
+ device::Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& sizeIn,
+ bool entire) const
+{
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
+
+ if (!gpuMem(srcMemory).isHostMemDirectAccess() &&
+ !gpuMem(dstMemory).isHostMemDirectAccess()) {
+ uint blitType = BlitCopyBuffer;
+ size_t dim = 1;
+ size_t globalWorkOffset[3] = { 0, 0, 0 };
+ size_t globalWorkSize = 0;
+ size_t localWorkSize = 0;
+
+ // todo LC shows much better performance with the unaligned version
+ const static uint CopyBuffAlignment[3] = { 1/*16*/, 1/*4*/, 1 };
+ amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
+
+ bool aligned = false;
+ uint i;
+ for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) {
+ // Check source alignments
+ aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0);
+ // Check destination alignments
+ aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0);
+ // Check copy size alignment in the first dimension
+ aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0);
+
+ if (aligned) {
+ if (CopyBuffAlignment[i] != 1) {
+ blitType = BlitCopyBufferAligned;
+ }
+ break;
+ }
+ }
+
+ cl_uint remain;
+ if (blitType == BlitCopyBufferAligned) {
+ size.c[0] /= CopyBuffAlignment[i];
+ }
+ else {
+ remain = size[0] % 4;
+ size.c[0] /= 4;
+ size.c[0] += 1;
+ }
+
+ // Program the dispatch dimensions
+ localWorkSize = 256;
+ globalWorkSize = amd::alignUp(size[0] , 256);
+
+ // Program kernels arguments for the blit operation
+ cl_mem mem = as_cl(srcMemory.owner());
+ setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+ mem = as_cl(dstMemory.owner());
+ setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+ // Program source origin
+ cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];;
+ setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
+
+ // Program destination origin
+ cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];;
+ setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
+
+ cl_ulong copySize = size[0];
+ setArgument(kernels_[blitType], 4, sizeof(copySize), &copySize);
+
+ if (blitType == BlitCopyBufferAligned) {
+ cl_int alignment = CopyBuffAlignment[i];
+ setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment);
+ }
+ else {
+ setArgument(kernels_[blitType], 5, sizeof(remain), &remain);
+ }
+
+ // Create ND range object for the kernel's execution
+ amd::NDRangeContainer ndrange(1,
+ globalWorkOffset, &globalWorkSize, &localWorkSize);
+
+ // Execute the blit
+ address parameters = kernels_[blitType]->parameters().capture(dev());
+ result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+ kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev());
+ }
+ else {
+ result = DmaBlitManager::copyBuffer(
+ srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
}
- if (fillMemory->getDeviceMemory(dev_) == NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- cl_mem clmem = ((cl_mem) as_cl(fillMemory));
- kernels_[fillType]->parameters().set(2, sizeof(cl_mem), &clmem);
- cl_ulong offset = origin[0];
- if (dwordAligned) {
- patternSize /= sizeof(uint32_t);
- offset /= sizeof(uint32_t);
- }
- kernels_[fillType]->parameters().set(3, sizeof(cl_uint), &patternSize);
- kernels_[fillType]->parameters().set(4, sizeof(offset), &offset);
- kernels_[fillType]->parameters().set(5, sizeof(fillSize), &fillSize);
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(1,
- globalWorkOffset, &globalWorkSize, &localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[fillType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(
- ndrange, *kernels_[fillType], parameters, NULL);
- kernels_[fillType]->parameters().release(const_cast<address>(parameters), dev_);
-
- // Wait for the transfer to finish so that we could safely release the
- // fill memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- fillMemory->release();
+ synchronize();
return result;
}
@@ -1490,48 +2198,249 @@ KernelBlitManager::fillImage(
bool entire
) const
{
- return HsaBlitManager::fillImage(memory, pattern, origin, size, entire);
-}
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
-bool
-KernelBlitManager::create(amd::Device& device)
-{
- if (!HsaBlitManager::create(device)) {
- return false;
- }
- if (!createProgram(static_cast(device))) {
- return false;
+ // Use host fill if memory has direct access
+ if (setup_.disableFillImage_ ||
+ gpuMem(memory).isHostMemDirectAccess()) {
+ result = HostBlitManager::fillImage(
+ memory, pattern, origin, size, entire);
+ synchronize();
+ return result;
}
- return true;
-}
+ uint fillType;
+ size_t dim = 0;
+ size_t globalWorkOffset[3] = { 0, 0, 0 };
+ size_t globalWorkSize[3];
+ size_t localWorkSize[3];
+ Memory* memView = &gpuMem(memory);
+ amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
-bool
-KernelBlitManager::createProgram(Device& device)
-{
- // Save context and program for this device
- context_ = device.blitProgram()->context_;
- context_->retain();
- program_ = device.blitProgram()->program_;
- program_->retain();
+ // Program the kernels workload depending on the fill dimensions
+ fillType = FillImage;
+ dim = 3;
- bool result = true;
+ void *newpattern = const_cast(pattern);
+ cl_uint4 iFillColor;
- // Create kernel objects for all blits
- for (uint i = 0; i < BlitTotal; ++i) {
- const amd::Symbol* symbol = program_->findSymbol(BlitName[i]);
- if (symbol == NULL) {
- result = false;
- continue;
+ bool rejected = false;
+ bool releaseView = false;
+
+ // For sRGBA, we need to create a view
+ if (newFormat.image_channel_order == CL_sRGBA) {
+ // Find unsupported data type
+ for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+ if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+ newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+ rejected = true;
+ break;
+ }
}
- kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]);
- if (kernels_[i] == NULL) {
- result = false;
- continue;
+
+ if (newFormat.image_channel_order == CL_sRGBA) {
+ // Convert the linear RGB floating-point color to 8-bit unsigned integer sRGB values, because the hw does not support write_imagef for sRGB.
+ float *fColor = static_cast(newpattern);
+ iFillColor.s[0] = sRGBmap(fColor[0]);
+ iFillColor.s[1] = sRGBmap(fColor[1]);
+ iFillColor.s[2] = sRGBmap(fColor[2]);
+ iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f);
+ newpattern = static_cast(&iFillColor);
+ for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+ if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+ newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+ rejected = true;
+ break;
+ }
+ }
}
}
+ // If the image format was rejected, then attempt to create a view
+ if (rejected) {
+ memView = createView(gpuMem(memory), newFormat);
+ if (memView != NULL) {
+ rejected = false;
+ releaseView = true;
+ }
+ }
+
+ if (rejected) {
+ return DmaBlitManager::fillImage(memory, pattern, origin, size, entire);
+ }
+
+ // Perform workload split to allow multiple operations in a single thread
+ globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize;
+ // Find the current blit type
+ if (memView->owner()->asImage()->getDims() == 1) {
+ globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256);
+ globalWorkSize[1] = amd::alignUp(size[1], 1);
+ globalWorkSize[2] = amd::alignUp(size[2], 1);
+ localWorkSize[0] = 256;
+ localWorkSize[1] = localWorkSize[2] = 1;
+ }
+ else if (memView->owner()->asImage()->getDims()== 2) {
+ globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 16);
+ globalWorkSize[1] = amd::alignUp(size[1], 16);
+ globalWorkSize[2] = amd::alignUp(size[2], 1);
+ localWorkSize[0] = localWorkSize[1] = 16;
+ localWorkSize[2] = 1;
+ }
+ else {
+ globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8);
+ globalWorkSize[1] = amd::alignUp(size[1], 8);
+ globalWorkSize[2] = amd::alignUp(size[2], 4);
+ localWorkSize[0] = localWorkSize[1] = 8;
+ localWorkSize[2] = 4;
+ }
+
+ // Program kernels arguments for the blit operation
+ cl_mem mem = as_cl(memView->owner());
+ setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
+ setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern);
+ setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern);
+ setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern);
+
+ cl_int fillOrigin[4] = { (cl_int)origin[0],
+ (cl_int)origin[1],
+ (cl_int)origin[2], 0 };
+ cl_int fillSize[4] = { (cl_int)size[0],
+ (cl_int)size[1],
+ (cl_int)size[2], 0 };
+ setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
+ setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);
+
+ // Find the type of image
+ uint32_t type = 0;
+ switch (newFormat.image_channel_data_type) {
+ case CL_SNORM_INT8:
+ case CL_SNORM_INT16:
+ case CL_UNORM_INT8:
+ case CL_UNORM_INT16:
+ case CL_UNORM_SHORT_565:
+ case CL_UNORM_SHORT_555:
+ case CL_UNORM_INT_101010:
+ case CL_HALF_FLOAT:
+ case CL_FLOAT:
+ type = 0;
+ break;
+ case CL_SIGNED_INT8:
+ case CL_SIGNED_INT16:
+ case CL_SIGNED_INT32:
+ type = 1;
+ break;
+ case CL_UNSIGNED_INT8:
+ case CL_UNSIGNED_INT16:
+ case CL_UNSIGNED_INT32:
+ type = 2;
+ break;
+ }
+ setArgument(kernels_[fillType], 6, sizeof(type), &type);
+
+ // Create ND range object for the kernel's execution
+ amd::NDRangeContainer ndrange(dim,
+ globalWorkOffset, globalWorkSize, localWorkSize);
+
+ // Execute the blit
+ address parameters = kernels_[fillType]->parameters().capture(dev());
+ result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, NULL);
+ kernels_[fillType]->parameters().release(const_cast(parameters), dev());
+ if (releaseView) {
+ // todo SRD programming could be changed to avoid a stall
+ gpu().releaseGpuMemoryFence();
+ memView->owner()->release();
+ }
+
+ synchronize();
return result;
}
-} // namespace roc
+amd::Memory*
+DmaBlitManager::pinHostMemory(
+ const void* hostMem,
+ size_t pinSize,
+ size_t& partial) const
+{
+ size_t pinAllocSize;
+ const static bool SysMem = true;
+ amd::Memory* amdMemory;
+ // Pins host memory for GPU access; 'partial' receives the byte offset of hostMem from the aligned base, NULL is returned on failure
+ // Align offset to 4K boundary (Vista/Win7 limitation)
+ char* tmpHost = const_cast<char*>(
+ amd::alignDown(reinterpret_cast<const char*>(hostMem),
+ PinnedMemoryAlignment));
+
+ // Find the partial size for unaligned copy
+ partial = reinterpret_cast<const char*>(hostMem) - tmpHost;
+
+ // Recalculate pin memory size
+ pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
+
+ amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize);
+
+ if (NULL != amdMemory) {
+ return amdMemory;
+ }
+
+ amdMemory = new(*context_)
+ amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize);
+ // A failed allocation must bail out as well: amdMemory is dereferenced right below
+ if ((amdMemory == NULL) || !amdMemory->create(tmpHost, SysMem)) {
+ if (amdMemory != NULL) amdMemory->release();
+ return NULL;
+ }
+
+ // Get device memory for this virtual device
+ // @note: This will force real memory pinning
+ amdMemory->setVirtualDevice(&gpu());
+ Memory* srcMemory = dev().getRocMemory(amdMemory);
+
+ if (srcMemory == NULL) {
+ // Release all pinned memory and attempt pinning again
+ gpu().releasePinnedMem();
+ srcMemory = dev().getRocMemory(amdMemory);
+ if (srcMemory == NULL) {
+ // Release memory
+ amdMemory->release();
+ amdMemory = NULL;
+ }
+ }
+
+ return amdMemory;
+}
+
+Memory*
+KernelBlitManager::createView(
+ const Memory& parent,
+ const cl_image_format format) const
+{
+ assert((parent.owner()->asBuffer() == nullptr) && "View supports images only");
+ amd::Image* viewImage =
+ parent.owner()->asImage()->createView(parent.owner()->getContext(), format, &gpu());
+ // Step 1: the view of the parent image must have been allocated
+ if (nullptr == viewImage) {
+ LogError("[OCL] Fail to allocate view of image object");
+ return nullptr;
+ }
+ // Step 2: allocate the device image for the view
+ Image* viewMem = new roc::Image(dev(), *viewImage);
+ if (nullptr == viewMem) {
+ LogError("[OCL] Fail to allocate device mem object for the view");
+ viewImage->release();
+ return nullptr;
+ }
+ // Step 3: create the device-side view of 'parent'; undo both allocations on failure
+ if (!viewMem->createView(parent)) {
+ LogError("[OCL] Fail to create device mem object for the view");
+ delete viewMem;
+ viewImage->release();
+ return nullptr;
+ }
+ // Register the device image as the view's device memory for this device
+ viewImage->replaceDeviceMemory(&dev_, viewMem);
+
+ return viewMem;
+}
+
+} // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
index 16d1ef2363..8891f7170c 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
@@ -1,5 +1,5 @@
//
-// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#pragma once
@@ -8,12 +8,13 @@
#include "platform/commandqueue.hpp"
#include "device/device.hpp"
#include "device/blit.hpp"
+#include "device/rocm/rocdefs.hpp"
-/*! \addtogroup HSA Blit Implementation
+/*! \addtogroup ROC Blit Implementation
* @{
*/
-//! HSA Blit Manager Implementation
+//! ROC Blit Manager Implementation
namespace roc {
class Device;
@@ -22,221 +23,29 @@ class Memory;
class VirtualGPU;
//! DMA Blit Manager
-class HsaBlitManager : public device::HostBlitManager
+class DmaBlitManager : public device::HostBlitManager
{
public:
- //! Constructor
- HsaBlitManager(
- device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits
- Setup setup = Setup() //!< Specifies HW accelerated blits
- );
-
- //! Destructor
- virtual ~HsaBlitManager() {
- if (completion_signal_.handle != 0) {
- hsa_signal_destroy(completion_signal_);
- }
- }
-
- //! Creates HostBlitManager object
- virtual bool create(amd::Device& device) {
- if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) {
- return false;
- }
- return true;
- }
-
- //! Copies a buffer object to system memory
- virtual bool readBuffer(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destination host memory
- const amd::Coord3D& origin, //!< Source origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to system memory
- virtual bool readBufferRect(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destinaiton host memory
- const amd::BufferRect& bufRect, //!< Source rectangle
- const amd::BufferRect& hostRect, //!< Destination rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies an image object to system memory
- virtual bool readImage(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destination host memory
- const amd::Coord3D& origin, //!< Source origin
- const amd::Coord3D& size, //!< Size of the copy region
- size_t rowPitch, //!< Row pitch for host memory
- size_t slicePitch, //!< Slice pitch for host memory
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to a buffer object
- virtual bool writeBuffer(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to a buffer object
- virtual bool writeBufferRect(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::BufferRect& hostRect, //!< Destination rectangle
- const amd::BufferRect& bufRect, //!< Source rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to an image object
- virtual bool writeImage(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- size_t rowPitch, //!< Row pitch for host memory
- size_t slicePitch, //!< Slice pitch for host memory
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to another buffer object
- virtual bool copyBuffer(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to another buffer object
- virtual bool copyBufferRect(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::BufferRect& srcRect, //!< Source rectangle
- const amd::BufferRect& dstRect, //!< Destination rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies an image object to a buffer object
- virtual bool copyImageToBuffer(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false, //!< Entire buffer will be updated
- size_t rowPitch = 0, //!< Pitch for buffer
- size_t slicePitch = 0 //!< Slice for buffer
- ) const;
-
- //! Copies a buffer object to an image object
- virtual bool copyBufferToImage(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false, //!< Entire buffer will be updated
- size_t rowPitch = 0, //!< Pitch for buffer
- size_t slicePitch = 0 //!< Slice for buffer
- ) const;
-
- //! Copies an image object to another image object
- virtual bool copyImage(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Fills a buffer memory with a pattern data
- virtual bool fillBuffer(
- device::Memory& memory, //!< Memory object to fill with pattern
- const void* pattern, //!< Pattern data
- size_t patternSize, //!< Pattern size
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Fills an image memory with a pattern data
- virtual bool fillImage(
- device::Memory& dstMemory, //!< Memory object to fill with pattern
- const void* pattern, //!< Pattern data
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
-protected:
- //! Returns the virtual GPU object
- VirtualGPU& gpu() const { return static_cast(vDev_); }
-
-private:
- //! Handle of Hsa Device object
- const roc::Device& roc_device_;
-
- hsa_signal_t completion_signal_;
-
- //! Assits in transferring data from Host to Local or vice versa
- //! taking into account the Hsail profile supported by Hsa Agent
- bool hsaCopy(
- const void *hostSrc, //!< Contains source data to be copied
- void *hostDst, //!< Destination buffer address for copying
- uint32_t size, //!< Size of data to copy in bytes
- bool hostToDev //!< True if data is copied from Host To Device
- ) const;
-
- //! Disable copy constructor
- HsaBlitManager(const HsaBlitManager&);
-
- //! Disable operator=
- HsaBlitManager& operator=(const HsaBlitManager&);
-};
-
-//! Kernel Blit Manager
-//class KernelBlitManager : public HsaBlitManager
-class KernelBlitManager : public HsaBlitManager
-{
-private:
- VirtualGPU& gpu() const { return static_cast(vDev_); }
-public:
- enum {
- BlitCopyImage = 0,
- BlitCopyImage1DA,
- BlitCopyImageToBuffer,
- BlitCopyBufferToImage,
- BlitCopyBufferRect,
- BlitCopyBufferRectAligned,
- BlitCopyBuffer,
- BlitCopyBufferAligned,
- FillBuffer,
- FillImage,
- BlitTotal
- };
-
//! Constructor
- KernelBlitManager(
- device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits
- Setup setup = Setup() //!< Specifies HW accelerated blits
+ DmaBlitManager(
+ VirtualGPU& gpu, //!< Virtual GPU to be used for blits
+ Setup setup = Setup() //!< Specifies HW accelerated blits
);
//! Destructor
- virtual ~KernelBlitManager();
+ virtual ~DmaBlitManager() {
+ if (completion_signal_.handle != 0) {
+ hsa_signal_destroy(completion_signal_);
+ }
+ }
- //! Creates HostBlitManager object
- virtual bool create(amd::Device& device);
+ //! Creates DmaBlitManager object; returns false if the completion signal can't be created
+ virtual bool create(amd::Device& device) {
+ if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) {
+ return false;
+ }
+ return true;
+ }
//! Copies a buffer object to system memory
virtual bool readBuffer(
@@ -352,6 +161,230 @@ public:
bool entire = false //!< Entire buffer will be updated
) const;
+protected:
+ const static uint MaxPinnedBuffers = 4;
+
+ //! Synchronizes the blit operations if necessary
+ inline void synchronize() const;
+
+ //! Returns the virtual GPU object
+ VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
+
+ //! Returns the ROC device object
+ const Device& dev() const { return static_cast<const Device&>(dev_); }
+
+ inline Memory& gpuMem(device::Memory& mem) const;
+
+ //! Pins host memory for GPU access
+ amd::Memory* pinHostMemory(
+ const void* hostMem, //!< Host memory pointer
+ size_t pinSize, //!< Host memory size
+ size_t& partial //!< Extra offset for memory alignment
+ ) const;
+
+ //! Assists in transferring data from Host to Local or vice versa
+ //! taking into account the Hsail profile supported by Hsa Agent
+ bool hsaCopy(
+ const Memory& srcMemory,
+ const Memory& dstMemory,
+ const amd::Coord3D& srcOrigin,
+ const amd::Coord3D& dstOrigin,
+ const amd::Coord3D& size,
+ bool enableCopyRect = false,
+ bool flushDMA = true) const;
+
+ const size_t MinSizeForPinnedTransfer;
+ bool completeOperation_; //!< DMA blit manager must complete operation
+ amd::Context* context_; //!< A dummy context
+
+private:
+
+ //! Disable copy constructor
+ DmaBlitManager(const DmaBlitManager&);
+
+ //! Disable operator=
+ DmaBlitManager& operator=(const DmaBlitManager&);
+
+ //! Reads video memory, using a staged buffer
+ bool readMemoryStaged(
+ Memory& srcMemory, //!< Source memory object
+ void* dstHost, //!< Destination host memory
+ Memory& xferBuf, //!< Staged buffer for read
+ size_t origin, //!< Original offset in the source memory
+ size_t& offset, //!< Offset for the current copy pointer
+ size_t& totalSize, //!< Total size for copy region
+ size_t xferSize //!< Transfer size
+ ) const;
+
+ //! Write into video memory, using a staged buffer
+ bool writeMemoryStaged(
+ const void* srcHost, //!< Source host memory
+ Memory& dstMemory, //!< Destination memory object
+ Memory& xferBuf, //!< Staged buffer for write
+ size_t origin, //!< Original offset in the destination memory
+ size_t& offset, //!< Offset for the current copy pointer
+ size_t& totalSize, //!< Total size for the copy region
+ size_t xferSize //!< Transfer size
+ ) const;
+
+ //! HSA signal created in create() and destroyed in the destructor
+ hsa_signal_t completion_signal_;
+
+ //! Assists in transferring data from Host to Local or vice versa
+ //! taking into account the Hsail profile supported by Hsa Agent
+ bool hsaCopyStaged(
+ const_address hostSrc, //!< Contains source data to be copied
+ address hostDst, //!< Destination buffer address for copying
+ size_t size, //!< Size of data to copy in bytes
+ address staging, //!< Staging resource
+ bool hostToDev //!< True if data is copied from Host To Device
+ ) const;
+};
+
+//! Kernel Blit Manager
+class KernelBlitManager : public DmaBlitManager
+{
+public:
+ enum {
+ BlitCopyImage = 0,
+ BlitCopyImage1DA,
+ BlitCopyImageToBuffer,
+ BlitCopyBufferToImage,
+ BlitCopyBufferRect,
+ BlitCopyBufferRectAligned,
+ BlitCopyBuffer,
+ BlitCopyBufferAligned,
+ FillBuffer,
+ FillImage,
+ BlitTotal
+ };
+
+ //! Constructor
+ KernelBlitManager(
+ VirtualGPU& gpu, //!< Virtual GPU to be used for blits
+ Setup setup = Setup() //!< Specifies HW accelerated blits
+ );
+
+ //! Destructor
+ virtual ~KernelBlitManager();
+
+ //! Creates KernelBlitManager object
+ virtual bool create(amd::Device& device);
+
+ //! Copies a buffer object to another buffer object
+ virtual bool copyBufferRect(
+ device::Memory& srcMemory, //!< Source memory object
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::BufferRect& srcRectIn, //!< Source rectangle
+ const amd::BufferRect& dstRectIn, //!< Destination rectangle
+ const amd::Coord3D& sizeIn, //!< Size of the copy region
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
+ //! Copies a buffer object to system memory
+ virtual bool readBuffer(
+ device::Memory& srcMemory, //!< Source memory object
+ void* dstHost, //!< Destination host memory
+ const amd::Coord3D& origin, //!< Source origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
+ //! Copies a rectangular region of a buffer object to system memory
+ virtual bool readBufferRect(
+ device::Memory& srcMemory, //!< Source memory object
+ void* dstHost, //!< Destination host memory
+ const amd::BufferRect& bufRect, //!< Source rectangle
+ const amd::BufferRect& hostRect, //!< Destination rectangle
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
+ //! Copies system memory to a buffer object
+ virtual bool writeBuffer(
+ const void* srcHost, //!< Source host memory
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::Coord3D& origin, //!< Destination origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
+ //! Copies system memory to a buffer object
+ virtual bool writeBufferRect(
+ const void* srcHost, //!< Source host memory
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::BufferRect& hostRect, //!< Destination rectangle
+ const amd::BufferRect& bufRect, //!< Source rectangle
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
+ //! Copies a buffer object to another buffer object
+ virtual bool copyBuffer(
+ device::Memory& srcMemory, //!< Source memory object
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::Coord3D& srcOrigin, //!< Source origin
+ const amd::Coord3D& dstOrigin, //!< Destination origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
+ //! Copies a buffer object to an image object
+ virtual bool copyBufferToImage(
+ device::Memory& srcMemory, //!< Source memory object
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::Coord3D& srcOrigin, //!< Source origin
+ const amd::Coord3D& dstOrigin, //!< Destination origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false, //!< Entire buffer will be updated
+ size_t rowPitch = 0, //!< Pitch for buffer
+ size_t slicePitch = 0 //!< Slice for buffer
+ ) const;
+
+ //! Copies an image object to a buffer object
+ virtual bool copyImageToBuffer(
+ device::Memory& srcMemory, //!< Source memory object
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::Coord3D& srcOrigin, //!< Source origin
+ const amd::Coord3D& dstOrigin, //!< Destination origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false, //!< Entire buffer will be updated
+ size_t rowPitch = 0, //!< Pitch for buffer
+ size_t slicePitch = 0 //!< Slice for buffer
+ ) const;
+
+ //! Copies an image object to another image object
+ virtual bool copyImage(
+ device::Memory& srcMemory, //!< Source memory object
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::Coord3D& srcOrigin, //!< Source origin
+ const amd::Coord3D& dstOrigin, //!< Destination origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
+ //! Copies an image object to system memory
+ virtual bool readImage(
+ device::Memory& srcMemory, //!< Source memory object
+ void* dstHost, //!< Destination host memory
+ const amd::Coord3D& origin, //!< Source origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ size_t rowPitch, //!< Row pitch for host memory
+ size_t slicePitch, //!< Slice pitch for host memory
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
+ //! Copies system memory to an image object
+ virtual bool writeImage(
+ const void* srcHost, //!< Source host memory
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::Coord3D& origin, //!< Destination origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ size_t rowPitch, //!< Row pitch for host memory
+ size_t slicePitch, //!< Slice pitch for host memory
+ bool entire = false //!< Entire buffer will be updated
+ ) const;
+
//! Fills a buffer memory with a pattern data
virtual bool fillBuffer(
device::Memory& memory, //!< Memory object to fill with pattern
@@ -372,26 +405,56 @@ public:
) const;
private:
- //! Disable copy constructor
- KernelBlitManager(const KernelBlitManager&);
+ static const size_t MaxXferBuffers = 2;
+ static const uint TransferSplitSize = 1;
- //! Disable operator=
- KernelBlitManager& operator=(const KernelBlitManager&);
+ //! Copies a buffer object to an image object
+ bool copyBufferToImageKernel(
+ device::Memory& srcMemory, //!< Source memory object
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::Coord3D& srcOrigin, //!< Source origin
+ const amd::Coord3D& dstOrigin, //!< Destination origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false, //!< Entire buffer will be updated
+ size_t rowPitch = 0, //!< Pitch for buffer
+ size_t slicePitch = 0 //!< Slice for buffer
+ ) const;
+
+ //! Copies an image object to a buffer object
+ bool copyImageToBufferKernel(
+ device::Memory& srcMemory, //!< Source memory object
+ device::Memory& dstMemory, //!< Destination memory object
+ const amd::Coord3D& srcOrigin, //!< Source origin
+ const amd::Coord3D& dstOrigin, //!< Destination origin
+ const amd::Coord3D& size, //!< Size of the copy region
+ bool entire = false, //!< Entire buffer will be updated
+ size_t rowPitch = 0, //!< Pitch for buffer
+ size_t slicePitch = 0 //!< Slice for buffer
+ ) const;
//! Creates a program for all blit operations
bool createProgram(
Device& device //!< Device object
);
- amd::Image::Format filterFormat(amd::Image::Format oldFormat) const;
+ //! Creates a view memory object
+ Memory* createView(
+ const Memory& parent, //!< Parent memory object
+ const cl_image_format format //!< The new format for a view
+ ) const;
- device::Memory *createImageView(
- device::Memory &parent,
- amd::Image::Format newFormat) const;
+ //! Disable copy constructor
+ KernelBlitManager(const KernelBlitManager&);
- amd::Context *context_; //!< A dummy context
- amd::Program *program_; //!< GPU program obejct
- amd::Kernel *kernels_[BlitTotal]; //!< GPU kernels for blit
+ //! Disable operator=
+ KernelBlitManager& operator=(const KernelBlitManager&);
+
+ amd::Program* program_; //!< GPU program object
+ amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
+ amd::Memory* constantBuffer_; //!< An internal CB for blits
+ amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images
+ size_t xferBufferSize_; //!< Transfer buffer size
+ amd::Monitor* lockXferOps_; //!< Lock transfer operation
};
static const char* BlitName[KernelBlitManager::BlitTotal] = {
@@ -404,9 +467,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
"copyBuffer",
"copyBufferAligned",
"fillBuffer",
- "fillImage"
+ "fillImage",
};
-/*@}*/
-} // namespace roc
+/*@}*/} // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
index b08349f1e7..26fb001b20 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
@@ -4,6 +4,9 @@
namespace roc {
+//! Alignment restriction for the pinned memory
+const static size_t PinnedMemoryAlignment = 4 * Ki;
+
typedef uint HsaDeviceId;
struct AMDDeviceInfo {
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index c2a17180d4..c784bf1de0 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -164,9 +164,7 @@ bool NullDevice::create(const AMDDeviceInfo& deviceInfo) {
settings_ = new Settings();
roc::Settings* hsaSettings = static_cast(settings_);
- if ((hsaSettings == NULL) ||
- // @Todo sramalin Use double precision from constsant
- !hsaSettings->create((true) & 0x1)) {
+ if ((hsaSettings == NULL) || !hsaSettings->create(false)) {
LogError("Error creating settings for NULL HSA device");
return false;
}
@@ -189,6 +187,8 @@ Device::Device(hsa_agent_t bkendDevice)
, alloc_granularity_(0)
, context_(nullptr)
, xferQueue_(nullptr)
+ , xferRead_(nullptr)
+ , xferWrite_(nullptr)
, numOfVgpus_(0)
{
group_segment_.handle = 0;
@@ -208,6 +208,10 @@ Device::~Device()
delete mapCache_;
delete mapCacheOps_;
+ // Destroy temporary buffers for read/write
+ delete xferRead_;
+ delete xferWrite_;
+
// Destroy transfer queue
if (xferQueue_ && xferQueue_->terminate()) {
delete xferQueue_;
@@ -363,6 +367,85 @@ Device::loaderQueryHostAddress(const void* device, const void** host)
: HSA_STATUS_ERROR;
}
+Device::XferBuffers::~XferBuffers()
+{
+ // Delete every transfer buffer that is still held in the free list
+ for (auto it = freeBuffers_.begin(); it != freeBuffers_.end(); ++it) {
+ delete *it;
+ }
+ freeBuffers_.clear();
+}
+
+bool
+Device::XferBuffers::create()
+{
+ // Puts one transfer buffer of bufSize_ into the free list.
+ // Returns true if the buffer was allocated and queued, false otherwise.
+
+ // Construct the buffer object first ...
+ Memory* staging = new Buffer(dev(), bufSize_);
+
+ // ... then allocate its memory; both steps have to succeed
+ const bool allocated = (staging != nullptr) && staging->create();
+ if (!allocated) {
+ // Nothing was queued, so the buffer object is dropped right here
+ delete staging;
+ LogError("Couldn't allocate a transfer buffer!");
+ return false;
+ }
+
+ // Keep the buffer in the free list until it is acquired
+ freeBuffers_.push_back(staging);
+ return true;
+}
+
+Memory&
+Device::XferBuffers::acquire()
+{
+ Memory* xferBuf = nullptr;
+ size_t listSize;
+ // Hands out a transfer buffer: a new one if the free list is empty, else the first free entry
+ // Lock the operations with the staged buffer list
+ amd::ScopedLock l(lock_);
+ listSize = freeBuffers_.size();
+
+ // If the list is empty, then attempt to allocate a staged buffer
+ if (listSize == 0) {
+ // Create the buffer object
+ xferBuf = new Buffer(dev(), bufSize_);
+
+ // Allocate memory for the transfer buffer
+ if ((nullptr == xferBuf) || !xferBuf->create()) {
+ delete xferBuf;
+ xferBuf = nullptr;
+ LogError("Couldn't allocate a transfer buffer!");
+ }
+ else {
+ ++acquiredCnt_;
+ }
+ }
+ // Otherwise reuse the first free buffer. NOTE(review): after a failed allocation the list is still empty here, so begin() is not dereferenceable - confirm
+ if (xferBuf == nullptr) {
+ xferBuf = *(freeBuffers_.begin());
+ freeBuffers_.erase(freeBuffers_.begin());
+ ++acquiredCnt_;
+ }
+
+ return *xferBuf;
+}
+
+void
+Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer)
+{
+ // NOTE(review): the wait for the buffer to become idle on 'gpu' is disabled below,
+ // although the next acquire can come from a different queue - confirm this is safe
+// buffer.wait(gpu);
+ // Lock the operations with the staged buffer list
+ amd::ScopedLock l(lock_);
+ freeBuffers_.push_back(&buffer);
+ --acquiredCnt_;
+}
+
bool Device::init()
{
#if defined(__linux__)
@@ -550,6 +633,28 @@ Device::create()
// Use just 1 entry by default for the map cache
mapCache_->push_back(NULL);
+    if (settings().stagedXferSize_ != 0) {
+        // Staged write buffers, sized to the staged size rounded up to 4KB
+        if (settings().stagedXferWrite_) {
+            xferWrite_ = new XferBuffers(*this,
+                amd::alignUp(settings().stagedXferSize_, 4 * Ki));
+            if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
+                LogError("Couldn't allocate transfer buffer objects for write");
+                return false;
+            }
+        }
+
+        // Staged read buffers, sized to the staged size rounded up to 4KB
+        if (settings().stagedXferRead_) {
+            xferRead_ = new XferBuffers(*this,
+                amd::alignUp(settings().stagedXferSize_, 4 * Ki));
+            if ((xferRead_ == nullptr) || !xferRead_->create()) {
+                LogError("Couldn't allocate transfer buffer objects for read");
+                return false;
+            }
+        }
+    }
+
xferQueue();
return true;
@@ -568,11 +673,17 @@ Device::createProgram(amd::option::Options* options) {
bool
Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev)
{
+ if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
+ HSA_AGENT_INFO_PROFILE,
+ &agent_profile_)) {
+ return false;
+ }
+
// Create HSA settings
settings_ = new Settings();
roc::Settings* hsaSettings = static_cast(settings_);
if ((hsaSettings == NULL) ||
- !hsaSettings->create((true) & 0x1)) {
+ !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL))) {
return false;
}
@@ -712,12 +823,6 @@ Device::populateOCLDeviceConstants()
::strcpy(info_.boardName_, device_name);
}
- if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
- HSA_AGENT_INFO_PROFILE,
- &agent_profile_)) {
- return false;
- }
-
if (HSA_STATUS_SUCCESS !=
hsa_agent_get_info(
_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
@@ -883,7 +988,7 @@ Device::populateOCLDeviceConstants()
if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory,
// base-profile = NUMA based non-coherent memory
- info_.hostUnifiedMemory_ = CL_TRUE;
+ info_.hostUnifiedMemory_ = CL_TRUE;
}
info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ?
sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN);
@@ -1244,6 +1349,13 @@ Device::addMapTarget(amd::Memory* memory) const
return true;
}
+//! Returns the ROC memory object that backs an AMD memory object on this device
+Memory*
+Device::getRocMemory(amd::Memory* mem) const
+{
+    return static_cast<Memory*>(mem->getDeviceMemory(*this));
+}
+
device::Memory*
Device::createMemory(amd::Memory &owner) const
{
@@ -1302,9 +1414,9 @@ Device::createMemory(amd::Memory &owner) const
imageView->replaceDeviceMemory(this, devImageView);
result = xferMgr().writeImage(owner.getHostMem(), *devImageView,
- amd::Coord3D(0), imageView->getRegion(),
- imageView->getRowPitch(),
- imageView->getSlicePitch(), true);
+ amd::Coord3D(0, 0, 0), imageView->getRegion(),
+ 0,
+ 0, true);
imageView->release();
}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
index cd6b4505dd..d6c5315eeb 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -219,6 +219,54 @@ private:
//! A HSA device ordinal (physical HSA device)
class Device : public NullDevice {
public:
+    //! Transfer buffers: a free list of staging buffers of a single size
+    class XferBuffers : public amd::HeapObject
+    {
+    public:
+        static const size_t MaxXferBufListSize = 8;
+
+        //! Constructor, no buffer is allocated until create() or acquire()
+        XferBuffers(const Device& device, size_t bufSize)
+            : bufSize_(bufSize)
+            , acquiredCnt_(0)
+            , gpuDevice_(device)
+        {}
+
+        //! Destructor, deletes the buffers left on the free list
+        ~XferBuffers();
+
+        //! Creates the xfer buffers object
+        bool create();
+
+        //! Acquires an instance of the transfer buffers
+        Memory& acquire();
+
+        //! Releases transfer buffer
+        void release(
+            VirtualGPU& gpu,    //!< Virtual GPU object used with the buffer
+            Memory&     buffer  //!< Transfer buffer for release
+            );
+
+        //! Returns the buffer's size for transfer
+        size_t bufSize() const { return bufSize_; }
+
+    private:
+        //! Disable copy constructor
+        XferBuffers(const XferBuffers&);
+
+        //! Disable assignment operator
+        XferBuffers& operator=(const XferBuffers&);
+
+        //! Get device object
+        const Device& dev() const { return gpuDevice_; }
+
+        size_t              bufSize_;       //!< Staged buffer size
+        std::list<Memory*>  freeBuffers_;   //!< The list of free buffers
+        amd::Atomic<uint>   acquiredCnt_;   //!< The total number of acquired buffers
+        amd::Monitor        lock_;          //!< Staged buffer acquire/release lock
+        const Device&       gpuDevice_;     //!< GPU device object
+    };
+
//! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc).
static bool init();
static void tearDown();
@@ -354,6 +402,17 @@ public:
//! Adds a map target to the cache
bool addMapTarget(amd::Memory* memory) const;
+ //! Returns the transfer buffers used for staged writes
+ XferBuffers& xferWrite() const { return *xferWrite_; }
+
+ //! Returns the transfer buffers used for staged reads
+ XferBuffers& xferRead() const { return *xferRead_; }
+
+ //! Returns a ROC memory object from AMD memory object
+ roc::Memory* getRocMemory(
+ amd::Memory* mem //!< Pointer to AMD memory object
+ ) const;
+
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
@@ -379,6 +438,9 @@ private:
VirtualGPU* xferQueue() const;
+ XferBuffers* xferRead_; //!< Transfer buffers read
+ XferBuffers* xferWrite_; //!< Transfer buffers write
+
public:
amd::Atomic numOfVgpus_; //!< Virtual gpu unique index
}; // class roc::Device
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
index bc614e85e4..6a7ce7756f 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
@@ -25,10 +25,18 @@ namespace roc {
/////////////////////////////////roc::Memory//////////////////////////////
Memory::Memory(const roc::Device &dev, amd::Memory &owner)
- : device::Memory(owner),
- dev_(dev),
- deviceMemory_(NULL),
- kind_(MEMORY_KIND_NORMAL)
+ : device::Memory(owner)
+ , dev_(dev)
+ , deviceMemory_(NULL)
+ , kind_(MEMORY_KIND_NORMAL)
+{
+}
+
+Memory::Memory(const roc::Device &dev, size_t size)
+ : device::Memory(size)
+ , dev_(dev)
+ , deviceMemory_(NULL)
+ , kind_(MEMORY_KIND_NORMAL)
{
}
@@ -64,8 +72,8 @@ Memory::allocateMapMemory(size_t allocationSize)
roc::Memory* hsaMapMemory = reinterpret_cast(
mapMemory->getDeviceMemory(dev_));
if (hsaMapMemory == nullptr) {
- mapMemory->release();
- return false;
+ mapMemory->release();
+ return false;
}
}
@@ -191,7 +199,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
return false;
#else
assert(owner()->isInterop() && "Object is not an interop object.");
-
+
mesa_glinterop_export_in in;
mesa_glinterop_export_out out;
@@ -213,7 +221,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
if(!dev_.mesa().Export(in, out))
return false;
-
+
size_t size;
hsa_agent_t agent=dev_.getBackendDevice();
hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata);
@@ -242,9 +250,18 @@ Buffer::Buffer(const roc::Device &dev, amd::Memory &owner)
: roc::Memory(dev, owner)
{}
+Buffer::Buffer(const roc::Device &dev, size_t size)
+ : roc::Memory(dev, size)
+{}
+
Buffer::~Buffer()
{
- destroy();
+ if (owner() == nullptr) {
+ dev_.hostFree(deviceMemory_, size());
+ }
+ else {
+ destroy();
+ }
}
void
@@ -288,6 +305,15 @@ Buffer::destroy()
bool
Buffer::create()
{
+ if (owner() == nullptr) {
+ deviceMemory_ = dev_.hostAlloc(size(), 1, false);
+ if (deviceMemory_ != nullptr) {
+ flags_ |= HostMemoryDirectAccess;
+ return true;
+ }
+ return false;
+ }
+
//Interop buffer
if(owner()->isInterop())
return createInteropBuffer(GL_ARRAY_BUFFER, 0, NULL, NULL);
@@ -303,8 +329,7 @@ Buffer::create()
}
const size_t offset = owner()->getOrigin();
- deviceMemory_ =
- static_cast(parentBuffer->getDeviceMemory()) + offset;
+ deviceMemory_ = parentBuffer->getDeviceMemory() + offset;
flags_ |= SubMemoryObject;
flags_ |=
@@ -562,10 +587,10 @@ Image::createInteropImage()
{
auto obj=owner()->getInteropObj()->asGLObject();
assert(obj->getCLGLObjectType()!=CL_GL_OBJECT_BUFFER && "Non-image OpenGL object used with interop image API.");
-
+
const hsa_amd_image_descriptor_t* meta;
size_t size=0;
-
+
GLenum glTarget = obj->getGLTarget();
if (glTarget == GL_TEXTURE_CUBE_MAP) {
glTarget = obj->getCubemapFace();
@@ -593,13 +618,13 @@ Image::createInteropImage()
if (obj->getGLTarget()==GL_TEXTURE_CUBE_MAP)
desc.setFace(obj->getCubemapFace());
-
+
originalDeviceMemory_=deviceMemory_;
hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
if(err!=HSA_STATUS_SUCCESS)
return false;
-
+
BufferGuard.Dismiss();
DescGuard.Dismiss();
return true;
@@ -672,13 +697,13 @@ Image::create()
}
bool
-Image::createView(Memory &parent)
+Image::createView(const Memory &parent)
{
deviceMemory_ = parent.getDeviceMemory();
originalDeviceMemory_ = (parent.owner()->asBuffer() != NULL)
? deviceMemory_
- : static_cast(parent).originalDeviceMemory_;
+        : static_cast<const Image&>(parent).originalDeviceMemory_;
kind_=parent.getKind();
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
index c2d77f6201..92e945cb13 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
@@ -18,10 +18,12 @@ class Memory : public device::Memory {
Memory(const roc::Device &dev, amd::Memory &owner);
+ Memory(const roc::Device &dev, size_t size);
+
virtual ~Memory();
- // Getter for deviceMemory_.
- void *getDeviceMemory() const { return deviceMemory_; }
+    // Getter for deviceMemory_ (cast to the address type)
+    address getDeviceMemory() const { return reinterpret_cast<address>(deviceMemory_); }
// Gets a pointer to a region of host-visible memory for use as the target
// of an indirect map for a given memory object
@@ -41,7 +43,7 @@ class Memory : public device::Memory {
Unimplemented();
return true;
}
-
+
// Immediate blocking write from device cache to owners's backing store.
// Marks owner as "current" by resetting the last writer to NULL.
virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags())
@@ -112,6 +114,7 @@ class Memory : public device::Memory {
class Buffer : public roc::Memory {
public:
Buffer(const roc::Device &dev, amd::Memory &owner);
+ Buffer(const roc::Device &dev, size_t size);
virtual ~Buffer();
@@ -143,7 +146,7 @@ public:
virtual bool create();
//! Create an image view
- bool createView(Memory &parent);
+ bool createView(const Memory &parent);
//! Gets a pointer to a region of host-visible memory for use as the target
//! of an indirect map for a given memory object
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
index d3cb3a0b17..cbdccdc700 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -53,14 +53,38 @@ Settings::Settings()
enablePartialDispatch_ = (partialDispatch) ? false : true;
partialDispatch_ = (partialDispatch) ? false : true;
commandQueues_ = 100; //!< Field value set to maximum number
- //!< concurrent Virtual GPUs for ROCm backend
+ //!< concurrent Virtual GPUs for ROCm backend
+
+ // Disable image DMA by default (ROCM runtime doesn't support it)
+ imageDMA_ = false;
+
+ stagedXferRead_ = true;
+ stagedXferWrite_ = true;
+ stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki;
+
+ // Initialize transfer buffer size to 1MB by default
+ xferBufSize_ = 1024 * Ki;
+
+ const static size_t MaxPinnedXferSize = 32;
+ pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi;
+ pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
}
bool
-Settings::create(bool doublePrecision)
+Settings::create(bool fullProfile)
{
customHostAllocator_ = true;
+ if (fullProfile) {
+ pinnedXferSize_ = 0;
+ stagedXferSize_ = 0;
+ xferBufSize_ = 0;
+ }
+ else {
+ pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_);
+ stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki);
+ }
+
// Enable extensions
enableExtension(ClKhrByteAddressableStore);
enableExtension(ClKhrGlobalInt32BaseAtomics);
@@ -72,21 +96,16 @@ Settings::create(bool doublePrecision)
enableExtension(ClKhr3DImageWrites);
enableExtension(ClAmdMediaOps);
enableExtension(ClAmdMediaOps2);
- if(MesaInterop::Supported())
- enableExtension(ClKhrGlSharing);
-
- // Make sure device supports doubles
- doublePrecision_ &= doublePrecision;
-
- if (doublePrecision_) {
- // Enable KHR double precision extension
- enableExtension(ClKhrFp64);
-#if !defined(WITH_LIGHTNING_COMPILER)
- // Also enable AMD double precision extension?
- enableExtension(ClAmdFp64);
-#endif // !defined(WITH_LIGHTNING_COMPILER)
+ if(MesaInterop::Supported()) {
+ enableExtension(ClKhrGlSharing);
}
+ // Enable KHR double precision extension
+ enableExtension(ClKhrFp64);
+#if !defined(WITH_LIGHTNING_COMPILER)
+ // Also enable AMD double precision extension?
+ enableExtension(ClAmdFp64);
+#endif // !defined(WITH_LIGHTNING_COMPILER)
enableExtension(ClKhrSubGroups);
enableExtension(ClKhrDepthImages);
@@ -109,6 +128,18 @@ Settings::override()
if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
commandQueues_ = GPU_MAX_COMMAND_QUEUES;
}
+
+    if (!flagIsDefault(GPU_IMAGE_DMA)) {
+        imageDMA_ = GPU_IMAGE_DMA;  // user override of the image DMA default
+    }
+
+ if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
+ xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
+ }
+
+ if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) {
+ pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
+ }
}
} // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
index fc716584ad..4e1f9400f8 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
@@ -26,7 +26,10 @@ public:
uint enableImageHandle_: 1; //!< Use HSAIL image/sampler pointer
uint enableNCMode_: 1; //!< Enable Non Coherent mode for system memory
uint enablePartialDispatch_: 1; //!< Enable support for Partial Dispatch
- uint reserved_: 26;
+ uint imageDMA_: 1; //!< Enable direct image DMA transfers
+ uint stagedXferRead_: 1; //!< Uses a staged buffer read
+ uint stagedXferWrite_: 1; //!< Uses a staged buffer write
+ uint reserved_: 22;
};
uint value_;
};
@@ -46,11 +49,16 @@ public:
uint kernargPoolSize_;
uint signalPoolSize_;
+ size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
+ size_t stagedXferSize_; //!< Staged buffer size
+ size_t pinnedXferSize_; //!< Pinned buffer size for transfer
+ size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
+
//! Default constructor
Settings();
//! Creates settings
- bool create(bool doublePrecision);
+ bool create(bool fullProfile);
private:
//! Disable copy constructor
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 06735ca902..6fe28dea08 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -446,6 +446,9 @@ bool VirtualGPU::releaseGpuMemoryFence() {
hasPendingDispatch_ = false;
+ // Release all transfer buffers on this command queue
+ releaseXferWrite();
+
// Release all memory dependencies
memoryDependency().clear();
@@ -1774,8 +1777,66 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd)
profilingEnd(vcmd);
}
-void VirtualGPU::flush(amd::Command *list, bool wait) {
+void VirtualGPU::flush(amd::Command *list, bool wait)
+{
releaseGpuMemoryFence();
updateCommandsState(list);
+ // Release all pinned memory
+ releasePinnedMem();
+}
+
+void
+VirtualGPU::addXferWrite(Memory& memory)
+{
+    // Cap the per-queue list: hand the oldest buffer back to the device pool
+    if (xferWriteBuffers_.size() >= Device::XferBuffers::MaxXferBufListSize) {
+        dev().xferWrite().release(*this, *xferWriteBuffers_.front());
+        xferWriteBuffers_.erase(xferWriteBuffers_.begin());
+    }
+    // Delay the release until the queue evicts or releases the buffer
+    xferWriteBuffers_.push_back(&memory);
+}
+
+void
+VirtualGPU::releaseXferWrite()
+{
+    for (Memory* staged : xferWriteBuffers_) {
+        dev().xferWrite().release(*this, *staged);  // back to the device pool
+    }
+    xferWriteBuffers_.clear();  // nothing is tracked by this queue anymore
+}
+
+void
+VirtualGPU::addPinnedMem(amd::Memory* mem)
+{
+    // A cached object already covers this host range, nothing to add
+    if (findPinnedMem(mem->getHostMem(), mem->getSize()) != nullptr) {
+        return;
+    }
+    if (pinnedMems_.size() > 7) {
+        pinnedMems_.front()->release();
+        pinnedMems_.erase(pinnedMems_.begin());
+    }
+    pinnedMems_.push_back(mem);  // kept alive until evicted or released
+}
+
+void
+VirtualGPU::releasePinnedMem()
+{
+    for (amd::Memory* pinned : pinnedMems_) {
+        pinned->release();  // drop the reference held by this queue
+    }
+    pinnedMems_.clear();  // forget every cached pinned object
+}
+
+amd::Memory*
+VirtualGPU::findPinnedMem(void* addr, size_t size)
+{
+    for (amd::Memory* cached : pinnedMems_) {
+        if ((addr == cached->getHostMem()) && (cached->getSize() >= size)) {
+            return cached;
+        }
+    }
+    return nullptr;
+}
} // End of roc namespace
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
index 8ff19db976..bbf29929f6 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -149,7 +149,7 @@ public:
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
void submitPerfCounter(amd::PerfCounterCommand& cmd){};
-
+
void flush(amd::Command* list = NULL, bool wait = false);
void submitFillMemory(amd::FillMemoryCommand& cmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
@@ -193,10 +193,24 @@ public:
bool processMemObjects(
const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params //!< Pointer to the param's store
- );
+ );
//Retun the virtual gpu unique index
uint index() const { return index_; }
+ //! Adds a staged write buffer to the list kept by this queue
+ void addXferWrite(Memory& memory);
+
+ //! Releases all staged write buffers kept by this queue
+ void releaseXferWrite();
+
+ //! Adds a pinned memory object to the list unless it is cached already
+ void addPinnedMem(amd::Memory* mem);
+
+ //! Releases all pinned memory objects kept by this queue
+ void releasePinnedMem();
+
+ //! Returns the cached pinned memory for a host range, or nullptr
+ amd::Memory* findPinnedMem(void* addr, size_t size);
// } roc OpenCL integration
private:
@@ -219,6 +233,9 @@ private:
//! Updates AQL header for the upcomming dispatch
void setAqlHeader(uint16_t header) { aqlHeader_ = header; }
+    std::vector<Memory*>        xferWriteBuffers_;  //!< Stage write buffers
+    std::vector<amd::Memory*>   pinnedMems_;        //!< Pinned memory list
+
/**
* @brief Maintains the list of sampler allocated for one or more kernel
* submissions.
@@ -231,16 +248,16 @@ private:
*/
bool hasPendingDispatch_;
Timestamp* timestamp_;
- hsa_agent_t gpu_device_; //!< Physical device
- hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
+ hsa_agent_t gpu_device_; //!< Physical device
+ hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
hsa_barrier_and_packet_t barrier_packet_;
hsa_signal_t barrier_signal_;
- uint32_t dispatch_id_; //!< This variable must be updated atomically.
- Device& roc_device_; //!< roc device object
+ uint32_t dispatch_id_; //!< This variable must be updated atomically.
+ Device& roc_device_; //!< roc device object
void * tools_lib_;
PrintfDbg* printfdbg_;
MemoryDependency memoryDependency_; //!< Memory dependency class
- uint16_t aqlHeader_; //!< AQL header for dispatch
+ uint16_t aqlHeader_; //!< AQL header for dispatch
char* kernarg_pool_base_;
size_t kernarg_pool_size_;