5efe63df44
ECR #304775 - Optimization for rectangular copies (Part 2). Due to the HW restriction of 14 bits for the src and dst pitch, it is advantageous to choose the optimal bpp: the higher the bpp, the larger the byte pitch. This indirectly helps to reduce the number of packets for a buffer copy (line by line vs. a single sub_win raw packet). ReviewBoardURL = http://ocltc.amd.com/reviews/r/5605/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#109 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#191 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.hpp#76 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#64 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#38 edit
2873 строки
97 KiB
C++
2873 строки
97 KiB
C++
//
|
|
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "platform/commandqueue.hpp"
|
|
#include "device/gpu/gpudevice.hpp"
|
|
#include "device/gpu/gpublit.hpp"
|
|
#include "device/gpu/gpumemory.hpp"
|
|
#include "device/gpu/gpuvirtual.hpp"
|
|
#include "utils/debug.hpp"
|
|
|
|
namespace gpu {
|
|
|
|
//! Constructs the DMA blit manager on top of the host blit manager.
//! The minimal size for a pinned transfer is taken from the device settings
//! and the "complete operation" flag starts cleared.
DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
    : HostBlitManager(gpu, setup)
    , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_)
    , completeOperation_(false)
{
    // Nothing else to initialize
}
|
|
|
|
inline void
|
|
DmaBlitManager::synchronize() const
|
|
{
|
|
if (syncOperation_) {
|
|
gpu().waitAllEngines();
|
|
gpu().releaseMemObjects();
|
|
}
|
|
}
|
|
|
|
//! Reinterprets an abstract device memory object as the GPU memory object.
//! The cast is unchecked (static_cast).
inline Memory&
DmaBlitManager::gpuMem(device::Memory& mem) const
{
    Memory& gpuMemory = static_cast<Memory&>(mem);
    return gpuMemory;
}
|
|
|
|
bool
|
|
DmaBlitManager::readMemoryStaged(
|
|
Resource& srcMemory,
|
|
void* dstHost,
|
|
Resource** xferBuf,
|
|
size_t origin,
|
|
size_t& offset,
|
|
size_t& totalSize,
|
|
size_t xferSize) const
|
|
{
|
|
amd::Coord3D dst(0, 0, 0);
|
|
size_t tmpSize;
|
|
uint idxWrite = 0;
|
|
uint idxRead = 0;
|
|
size_t chunkSize;
|
|
static const bool CopyRect = false;
|
|
// Flush DMA for ASYNC copy
|
|
static const bool FlushDMA = true;
|
|
|
|
if (dev().xferRead().bufSize() < 128 * Ki) {
|
|
chunkSize = dev().xferRead().bufSize();
|
|
}
|
|
else {
|
|
chunkSize = std::min(amd::alignUp(xferSize / 4, 256),
|
|
dev().xferRead().bufSize());
|
|
chunkSize = std::max(chunkSize, 128 * Ki);
|
|
}
|
|
|
|
// Find the partial transfer size
|
|
tmpSize = std::min(chunkSize, xferSize);
|
|
|
|
amd::Coord3D srcLast(origin + offset, 0, 0);
|
|
amd::Coord3D copySizeLast(tmpSize, 0, 0);
|
|
|
|
// Copy data into the temporary surface
|
|
if (!srcMemory.partialMemCopyTo(gpu(), srcLast, dst, copySizeLast,
|
|
*xferBuf[idxWrite], CopyRect, FlushDMA)) {
|
|
return false;
|
|
}
|
|
|
|
totalSize -= tmpSize;
|
|
xferSize -= tmpSize;
|
|
offset += tmpSize;
|
|
|
|
while (xferSize != 0) {
|
|
// Find the partial transfer size
|
|
tmpSize = std::min(chunkSize, xferSize);
|
|
|
|
amd::Coord3D src(origin + offset, 0, 0);
|
|
amd::Coord3D copySize(tmpSize, 0, 0);
|
|
|
|
idxWrite = (idxWrite + 1) % 2;
|
|
// Copy data into the temporary surface
|
|
if (!srcMemory.partialMemCopyTo(gpu(), src, dst, copySize,
|
|
*xferBuf[idxWrite], CopyRect, FlushDMA)) {
|
|
return false;
|
|
}
|
|
|
|
// Read previous buffer
|
|
if (!xferBuf[idxRead]->hostRead(&gpu(),
|
|
reinterpret_cast<char*>(dstHost) + offset - copySizeLast[0],
|
|
dst, copySizeLast)) {
|
|
return false;
|
|
}
|
|
idxRead = (idxRead + 1) % 2;
|
|
copySizeLast = copySize;
|
|
|
|
totalSize -= tmpSize;
|
|
xferSize -= tmpSize;
|
|
offset += tmpSize;
|
|
}
|
|
|
|
// Last read
|
|
if (!xferBuf[idxRead]->hostRead(&gpu(),
|
|
reinterpret_cast<char*>(dstHost) + offset - copySizeLast[0], dst, copySizeLast)) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::readBuffer(
|
|
device::Memory& srcMemory,
|
|
void* dstHost,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
// Use host copy if memory has direct access
|
|
if (setup_.disableReadBuffer_ ||
|
|
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
|
|
return HostBlitManager::readBuffer(
|
|
srcMemory, dstHost, origin, size, entire);
|
|
}
|
|
else {
|
|
size_t srcSize = size[0];
|
|
size_t offset = 0;
|
|
size_t pinSize = dev().settings().pinnedXferSize_;
|
|
pinSize = std::min(pinSize, srcSize);
|
|
|
|
// Check if a pinned transfer can be executed
|
|
if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
|
|
// Allign offset to 4K boundary (Vista/Win7 limitation)
|
|
char* tmpHost = const_cast<char*>(
|
|
amd::alignDown(reinterpret_cast<const char*>(dstHost),
|
|
PinnedMemoryAlignment));
|
|
|
|
// Find the partial size for unaligned copy
|
|
size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;
|
|
|
|
Resource* pin[MaxPinnedBuffers];
|
|
memset(pin, 0, sizeof(Resource*) * MaxPinnedBuffers);
|
|
uint pinIdx = 0;
|
|
bool first = true;
|
|
size_t tmpSize;
|
|
size_t pinAllocSize;
|
|
|
|
// Copy memory, using pinning
|
|
while (srcSize > 0) {
|
|
// If it's the first iterarion, then readjust the copy size
|
|
// to include alignment
|
|
if (first) {
|
|
pinAllocSize = amd::alignUp(pinSize + partial,
|
|
PinnedMemoryAlignment);
|
|
tmpSize = std::min(pinAllocSize - partial, srcSize);
|
|
first = false;
|
|
}
|
|
else {
|
|
tmpSize = std::min(pinSize, srcSize);
|
|
pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
|
|
partial = 0;
|
|
}
|
|
amd::Coord3D dst(partial, 0, 0);
|
|
amd::Coord3D srcPin(origin[0] + offset, 0, 0);
|
|
amd::HostMemoryReference hostMem(tmpHost);
|
|
amd::Coord3D copySizePin(tmpSize, 0, 0);
|
|
|
|
// Allocate a GPU resource for pinning
|
|
pin[pinIdx] = new Resource(
|
|
dev(), pinAllocSize / Heap::ElementSize, Heap::ElementType);
|
|
|
|
if (pin[pinIdx] != NULL) {
|
|
Resource::PinnedParams params;
|
|
params.owner_ = NULL;
|
|
params.gpu_ = &gpu();
|
|
params.hostMemRef_ = &hostMem;
|
|
params.size_ = pinAllocSize;
|
|
|
|
// Create memory object
|
|
if (pin[pinIdx]->create(Resource::Pinned, ¶ms)) {
|
|
if (!gpuMem(srcMemory).partialMemCopyTo(
|
|
gpu(), srcPin, dst, copySizePin, *pin[pinIdx])) {
|
|
LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
|
|
break;
|
|
}
|
|
}
|
|
else {
|
|
LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
|
|
break;
|
|
}
|
|
pinIdx = (pinIdx + 1) % MaxPinnedBuffers;
|
|
delete pin[pinIdx];
|
|
pin[pinIdx] = NULL;
|
|
}
|
|
else {
|
|
LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
|
|
break;
|
|
}
|
|
srcSize -= tmpSize;
|
|
offset += tmpSize;
|
|
tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
|
|
}
|
|
|
|
for (uint idx = 0; idx < MaxPinnedBuffers; ++idx) {
|
|
delete pin[idx];
|
|
}
|
|
}
|
|
|
|
if (0 != srcSize) {
|
|
Resource& xferBuf0 = dev().xferRead().acquire();
|
|
Resource& xferBuf1 = dev().xferRead().acquire();
|
|
Resource* xferBuf[2] = { &xferBuf0, &xferBuf1 };
|
|
|
|
// Read memory using a staged resource
|
|
if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0],
|
|
offset, srcSize, srcSize)) {
|
|
LogError("DmaBlitManager::readBuffer failed!");
|
|
return false;
|
|
}
|
|
|
|
dev().xferRead().release(gpu(), xferBuf1);
|
|
dev().xferRead().release(gpu(), xferBuf0);
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::readBufferRect(
|
|
device::Memory& srcMemory,
|
|
void* dstHost,
|
|
const amd::BufferRect& bufRect,
|
|
const amd::BufferRect& hostRect,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
// Use host copy if memory has direct access
|
|
if (setup_.disableReadBufferRect_ ||
|
|
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
|
|
return HostBlitManager::readBufferRect(
|
|
srcMemory, dstHost, bufRect, hostRect, size, entire);
|
|
}
|
|
else {
|
|
Resource& xferBuf = dev().xferRead().acquire();
|
|
|
|
amd::Coord3D dst(0, 0, 0);
|
|
size_t tmpSize = 0;
|
|
size_t bufOffset;
|
|
size_t hostOffset;
|
|
size_t srcSize;
|
|
|
|
for (size_t z = 0; z < size[2]; ++z) {
|
|
for (size_t y = 0; y < size[1]; ++y) {
|
|
srcSize = size[0];
|
|
bufOffset = bufRect.offset(0, y, z);
|
|
hostOffset = hostRect.offset(0, y, z);
|
|
|
|
while (srcSize != 0) {
|
|
// Find the partial transfer size
|
|
tmpSize = std::min(dev().xferRead().bufSize(), srcSize);
|
|
|
|
amd::Coord3D src(bufOffset, 0, 0);
|
|
amd::Coord3D copySize(tmpSize, 0, 0);
|
|
|
|
// Copy data into the temporary surface
|
|
if (!gpuMem(srcMemory).partialMemCopyTo(
|
|
gpu(), src, dst, copySize, xferBuf, true)) {
|
|
LogError("DmaBlitManager::readBufferRect failed!");
|
|
return false;
|
|
}
|
|
|
|
if (!xferBuf.hostRead(&gpu(),
|
|
reinterpret_cast<char*>(dstHost) + hostOffset,
|
|
dst, copySize)) {
|
|
LogError("DmaBlitManager::readBufferRect failed!");
|
|
return false;
|
|
}
|
|
|
|
srcSize -= tmpSize;
|
|
bufOffset += tmpSize;
|
|
hostOffset += tmpSize;
|
|
}
|
|
}
|
|
}
|
|
dev().xferRead().release(gpu(), xferBuf);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::readImage(
|
|
device::Memory& srcMemory,
|
|
void* dstHost,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
size_t rowPitch,
|
|
size_t slicePitch,
|
|
bool entire) const
|
|
{
|
|
if (setup_.disableReadImage_) {
|
|
return HostBlitManager::readImage(srcMemory, dstHost,
|
|
origin, size, rowPitch, slicePitch, entire);
|
|
}
|
|
else {
|
|
//! @todo Add HW accelerated path
|
|
return HostBlitManager::readImage(srcMemory, dstHost,
|
|
origin, size, rowPitch, slicePitch, entire);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::writeMemoryStaged(
|
|
const void* srcHost,
|
|
Resource& dstMemory,
|
|
Resource& xferBuf,
|
|
size_t origin,
|
|
size_t& offset,
|
|
size_t& totalSize,
|
|
size_t xferSize) const
|
|
{
|
|
amd::Coord3D src(0, 0, 0);
|
|
size_t tmpSize;
|
|
size_t chunkSize;
|
|
|
|
if (dev().xferRead().bufSize() < 128 * Ki) {
|
|
chunkSize = dev().xferRead().bufSize();
|
|
}
|
|
else {
|
|
chunkSize = std::min(amd::alignUp(xferSize / 4, 256),
|
|
dev().xferRead().bufSize());
|
|
chunkSize = std::max(chunkSize, 128 * Ki);
|
|
}
|
|
|
|
while (xferSize != 0) {
|
|
// Find the partial transfer size
|
|
tmpSize = std::min(chunkSize, xferSize);
|
|
amd::Coord3D dst(origin + offset, 0, 0);
|
|
amd::Coord3D copySize(tmpSize, 0, 0);
|
|
|
|
// Copy data into the temporary buffer, using CPU
|
|
if (!xferBuf.hostWrite(&gpu(),
|
|
reinterpret_cast<const char*>(srcHost) + offset,
|
|
src, copySize, Resource::Discard)) {
|
|
return false;
|
|
}
|
|
|
|
// Copy data into the original destination memory
|
|
if (!xferBuf.partialMemCopyTo(
|
|
gpu(), src, dst, copySize, dstMemory)) {
|
|
return false;
|
|
}
|
|
|
|
totalSize -= tmpSize;
|
|
offset += tmpSize;
|
|
xferSize -= tmpSize;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::writeBuffer(
|
|
const void* srcHost,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
// Use host copy if memory has direct access or it's persistent
|
|
if (setup_.disableWriteBuffer_ ||
|
|
gpuMem(dstMemory).isHostMemDirectAccess() ||
|
|
gpuMem(dstMemory).isPersistentDirectMap()) {
|
|
return HostBlitManager::writeBuffer(
|
|
srcHost, dstMemory, origin, size, entire);
|
|
}
|
|
else {
|
|
size_t dstSize = size[0];
|
|
size_t tmpSize = 0;
|
|
size_t offset = 0;
|
|
size_t pinSize = dev().settings().pinnedXferSize_;
|
|
pinSize = std::min(pinSize, dstSize);
|
|
|
|
// Check if a pinned transfer can be executed
|
|
if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
|
|
// Allign offset to 4K boundary (Vista/Win7 limitation)
|
|
char* tmpHost = const_cast<char*>(
|
|
amd::alignDown(reinterpret_cast<const char*>(srcHost),
|
|
PinnedMemoryAlignment));
|
|
|
|
// Find the partial size for unaligned copy
|
|
size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
|
|
|
|
Resource* pin[MaxPinnedBuffers];
|
|
memset(pin, 0, sizeof(Resource*) * MaxPinnedBuffers);
|
|
uint pinIdx = 0;
|
|
bool first = true;
|
|
size_t tmpSize;
|
|
size_t pinAllocSize;
|
|
|
|
// Copy memory, using pinning
|
|
while (dstSize > 0) {
|
|
// If it's the first iterarion, then readjust the copy size
|
|
// to include alignment
|
|
if (first) {
|
|
pinAllocSize = amd::alignUp(pinSize + partial,
|
|
PinnedMemoryAlignment);
|
|
tmpSize = std::min(pinAllocSize - partial, dstSize);
|
|
first = false;
|
|
}
|
|
else {
|
|
tmpSize = std::min(pinSize, dstSize);
|
|
pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
|
|
partial = 0;
|
|
}
|
|
amd::Coord3D src(partial, 0, 0);
|
|
amd::Coord3D dstPin(origin[0] + offset, 0, 0);
|
|
amd::HostMemoryReference hostMem(tmpHost);
|
|
amd::Coord3D copySizePin(tmpSize, 0, 0);
|
|
|
|
// Allocate a GPU resource for pinning
|
|
pin[pinIdx] = new Resource(
|
|
dev(), pinAllocSize / Heap::ElementSize, Heap::ElementType);
|
|
|
|
if (pin[pinIdx] != NULL) {
|
|
Resource::PinnedParams params;
|
|
params.owner_ = NULL;
|
|
params.gpu_ = &gpu();
|
|
params.hostMemRef_ = &hostMem;
|
|
params.size_ = pinAllocSize;
|
|
|
|
// Create memory object
|
|
if (pin[pinIdx]->create(Resource::Pinned, ¶ms)) {
|
|
if (!pin[pinIdx]->partialMemCopyTo(
|
|
gpu(), src, dstPin, copySizePin, gpuMem(dstMemory))) {
|
|
LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
|
|
break;
|
|
}
|
|
}
|
|
else {
|
|
LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
|
|
break;
|
|
}
|
|
pinIdx = (pinIdx + 1) % MaxPinnedBuffers;
|
|
delete pin[pinIdx];
|
|
pin[pinIdx] = NULL;
|
|
}
|
|
else {
|
|
LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
|
|
break;
|
|
}
|
|
dstSize -= tmpSize;
|
|
offset += tmpSize;
|
|
tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
|
|
}
|
|
|
|
for (uint idx = 0; idx < MaxPinnedBuffers; ++idx) {
|
|
delete pin[idx];
|
|
}
|
|
}
|
|
|
|
if (dstSize != 0) {
|
|
Resource& xferBuf = dev().xferWrite().acquire();
|
|
|
|
// Write memory using a staged resource
|
|
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0],
|
|
offset, dstSize, dstSize)) {
|
|
LogError("DmaBlitManager::writeBuffer failed!");
|
|
return false;
|
|
}
|
|
|
|
gpu().addXferWrite(xferBuf);
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::writeBufferRect(
|
|
const void* srcHost,
|
|
device::Memory& dstMemory,
|
|
const amd::BufferRect& hostRect,
|
|
const amd::BufferRect& bufRect,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
// Use host copy if memory has direct access or it's persistent
|
|
if (setup_.disableWriteBufferRect_ ||
|
|
dstMemory.isHostMemDirectAccess() ||
|
|
gpuMem(dstMemory).isPersistentDirectMap()) {
|
|
return HostBlitManager::writeBufferRect(
|
|
srcHost, dstMemory, hostRect, bufRect, size, entire);
|
|
}
|
|
else {
|
|
Resource& xferBuf = dev().xferWrite().acquire();
|
|
|
|
amd::Coord3D src(0, 0, 0);
|
|
size_t tmpSize = 0;
|
|
size_t bufOffset;
|
|
size_t hostOffset;
|
|
size_t dstSize;
|
|
|
|
for (size_t z = 0; z < size[2]; ++z) {
|
|
for (size_t y = 0; y < size[1]; ++y) {
|
|
dstSize = size[0];
|
|
bufOffset = bufRect.offset(0, y, z);
|
|
hostOffset = hostRect.offset(0, y, z);
|
|
|
|
while (dstSize != 0) {
|
|
// Find the partial transfer size
|
|
tmpSize = std::min(dev().xferWrite().bufSize(), dstSize);
|
|
|
|
amd::Coord3D dst(bufOffset, 0, 0);
|
|
amd::Coord3D copySize(tmpSize, 0, 0);
|
|
|
|
// Copy data into the temporary buffer, using CPU
|
|
if (!xferBuf.hostWrite(&gpu(),
|
|
reinterpret_cast<const char*>(srcHost) + hostOffset,
|
|
src, copySize, Resource::Discard)) {
|
|
LogError("DmaBlitManager::writeBufferRect failed!");
|
|
return false;
|
|
}
|
|
|
|
// Copy data into the original destination memory
|
|
if (!xferBuf.partialMemCopyTo(
|
|
gpu(), src, dst, copySize, gpuMem(dstMemory))) {
|
|
LogError("DmaBlitManager::writeBufferRect failed!");
|
|
return false;
|
|
}
|
|
|
|
dstSize -= tmpSize;
|
|
bufOffset += tmpSize;
|
|
hostOffset += tmpSize;
|
|
}
|
|
}
|
|
}
|
|
gpu().addXferWrite(xferBuf);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::writeImage(
|
|
const void* srcHost,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
size_t rowPitch,
|
|
size_t slicePitch,
|
|
bool entire) const
|
|
{
|
|
if (setup_.disableWriteImage_) {
|
|
return HostBlitManager::writeImage(
|
|
srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
|
|
}
|
|
else {
|
|
//! @todo Add HW accelerated path
|
|
return HostBlitManager::writeImage(
|
|
srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::copyBuffer(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
if (setup_.disableCopyBuffer_ ||
|
|
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
|
|
gpuMem(dstMemory).isHostMemDirectAccess())) {
|
|
return HostBlitManager::copyBuffer(
|
|
srcMemory, dstMemory, srcOrigin, dstOrigin, size);
|
|
}
|
|
else {
|
|
return gpuMem(srcMemory).partialMemCopyTo(gpu(),
|
|
srcOrigin, dstOrigin, size, gpuMem(dstMemory));
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::copyBufferRect(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::BufferRect& srcRect,
|
|
const amd::BufferRect& dstRect,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
if (setup_.disableCopyBufferRect_ ||
|
|
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
|
|
gpuMem(dstMemory).isHostMemDirectAccess())) {
|
|
return HostBlitManager::copyBufferRect(
|
|
srcMemory, dstMemory, srcRect, dstRect, size, entire);
|
|
}
|
|
else {
|
|
size_t srcOffset;
|
|
size_t dstOffset;
|
|
|
|
uint bytesPerElement = 16;
|
|
bool optimalElementSize = false;
|
|
bool subWindowRectCopy = dev().settings().rectLinearDMA_;
|
|
|
|
srcOffset = srcRect.offset(0, 0, 0);
|
|
dstOffset = dstRect.offset(0, 0, 0);
|
|
|
|
while (bytesPerElement >= 1) {
|
|
if (((srcOffset % 4) == 0) &&
|
|
((dstOffset % 4) == 0) &&
|
|
((size[0] % bytesPerElement) == 0) &&
|
|
((srcRect.rowPitch_ % bytesPerElement) == 0) &&
|
|
((srcRect.slicePitch_ % bytesPerElement) == 0) &&
|
|
((dstRect.rowPitch_ % bytesPerElement) == 0) &&
|
|
((dstRect.slicePitch_ % bytesPerElement) == 0)) {
|
|
optimalElementSize = true;
|
|
break;
|
|
}
|
|
bytesPerElement = bytesPerElement >> 1;
|
|
}
|
|
|
|
// 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
|
|
size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF;
|
|
size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF;
|
|
|
|
if (subWindowRectCopy &&
|
|
(!optimalElementSize ||
|
|
(srcRect.rowPitch_ > pitchLimit) ||
|
|
(dstRect.rowPitch_ > pitchLimit) ||
|
|
(size[0] > sizeLimit) || // See above
|
|
(size[1] > 0x3fff) || // 14 bits limit in HW
|
|
(size[2] > 0x7ff))) { // 11 bits limit in HW
|
|
// Restriction with rectLinearDRMDMA packet
|
|
subWindowRectCopy = false;
|
|
}
|
|
|
|
if (subWindowRectCopy) {
|
|
// Copy data with subwindow copy packet
|
|
if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
|
|
amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
|
|
amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
|
|
size, gpuMem(dstMemory), true, false, bytesPerElement)) {
|
|
LogError("copyBufferRect failed!");
|
|
return false;
|
|
}
|
|
}
|
|
else {
|
|
for (size_t z = 0; z < size[2]; ++z) {
|
|
for (size_t y = 0; y < size[1]; ++y) {
|
|
srcOffset = srcRect.offset(0, y, z);
|
|
dstOffset = dstRect.offset(0, y, z);
|
|
|
|
amd::Coord3D src(srcOffset, 0, 0);
|
|
amd::Coord3D dst(dstOffset, 0, 0);
|
|
amd::Coord3D copySize(size[0], 0, 0);
|
|
|
|
// Copy data
|
|
if (!gpuMem(srcMemory).partialMemCopyTo(
|
|
gpu(), src, dst, copySize, gpuMem(dstMemory))) {
|
|
LogError("copyBufferRect failed!");
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::copyImageToBuffer(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire,
|
|
size_t rowPitch,
|
|
size_t slicePitch) const
|
|
{
|
|
bool result = false;
|
|
|
|
if (setup_.disableCopyImageToBuffer_) {
|
|
result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory,
|
|
srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
|
|
}
|
|
else {
|
|
// Use CAL path for a transfer
|
|
result = gpuMem(srcMemory).partialMemCopyTo(
|
|
gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
|
|
|
|
// Check if a HostBlit transfer is required
|
|
if (completeOperation_ && !result) {
|
|
result = HostBlitManager::copyImageToBuffer(srcMemory,
|
|
dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::copyBufferToImage(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire,
|
|
size_t rowPitch,
|
|
size_t slicePitch) const
|
|
{
|
|
bool result = false;
|
|
|
|
if (setup_.disableCopyBufferToImage_) {
|
|
result = HostBlitManager::copyBufferToImage(srcMemory,
|
|
dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
|
|
}
|
|
else {
|
|
// Use CAL path for a transfer
|
|
result = gpuMem(srcMemory).partialMemCopyTo(
|
|
gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
|
|
|
|
// Check if a HostBlit transfer is required
|
|
if (completeOperation_ && !result) {
|
|
result = HostBlitManager::copyBufferToImage(srcMemory,
|
|
dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
DmaBlitManager::copyImage(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
bool result = false;
|
|
|
|
if (setup_.disableCopyImage_) {
|
|
return HostBlitManager::copyImage(srcMemory, dstMemory,
|
|
srcOrigin, dstOrigin, size, entire);
|
|
}
|
|
else {
|
|
//! @todo Add HW accelerated path
|
|
return HostBlitManager::copyImage(srcMemory, dstMemory,
|
|
srcOrigin, dstOrigin, size, entire);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
//! Constructs the kernel blit manager with no program, context, constant
//! buffer, kernels, transfer buffers or lock; KernelBlitManager::create()
//! fills them in.
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
    : DmaBlitManager(gpu, setup)
    , program_(NULL)
    , context_(NULL)
    , constantBuffer_(NULL)
    , xferBufferSize_(0)
    , lockXferOps_(NULL)
{
    uint idx;

    // No kernels are created yet
    for (idx = 0; idx < BlitTotal; ++idx) {
        kernels_[idx] = NULL;
    }

    // No transfer buffers are allocated yet
    for (idx = 0; idx < MaxXferBuffers; ++idx) {
        xferBuffers_[idx] = NULL;
    }

    completeOperation_ = false;
}
|
|
|
|
//! Releases every object the manager holds a reference to: the kernels,
//! the program, the context, the constant buffer and the transfer buffers,
//! and destroys the transfer lock.
KernelBlitManager::~KernelBlitManager()
{
    uint idx;

    for (idx = 0; idx < BlitTotal; ++idx) {
        if (kernels_[idx] != NULL) {
            kernels_[idx]->release();
        }
    }

    if (program_ != NULL) {
        program_->release();
    }

    // Release a dummy context
    if (context_ != NULL) {
        context_->release();
    }

    if (constantBuffer_ != NULL) {
        constantBuffer_->release();
    }

    for (idx = 0; idx < MaxXferBuffers; ++idx) {
        if (xferBuffers_[idx] != NULL) {
            xferBuffers_[idx]->release();
        }
    }

    delete lockXferOps_;
}
|
|
|
|
bool
|
|
KernelBlitManager::create(amd::Device& device)
|
|
{
|
|
if (!createProgram(static_cast<Device&>(device))) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::createProgram(Device& device)
|
|
{
|
|
std::vector<amd::Device*> devices;
|
|
devices.push_back(&device);
|
|
|
|
// Save context and program for this device
|
|
context_ = device.blitProgram()->context_;
|
|
context_->retain();
|
|
program_ = device.blitProgram()->program_;
|
|
program_->retain();
|
|
|
|
bool result = false;
|
|
do {
|
|
// Create kernel objects for all blits
|
|
for (uint i = 0; i < BlitTotal; ++i) {
|
|
const amd::Symbol* symbol = program_->findSymbol(BlitName[i]);
|
|
if (symbol == NULL) {
|
|
break;
|
|
}
|
|
kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]);
|
|
if (kernels_[i] == NULL) {
|
|
break;
|
|
}
|
|
// Validate blit kernels for the scratch memory usage (pre SI)
|
|
if (!device.validateKernel(*kernels_[i], &gpu())) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
result = true;
|
|
} while(!result);
|
|
|
|
// Create an internal constant buffer
|
|
constantBuffer_ = new (*context_)
|
|
amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
|
|
|
|
if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) {
|
|
constantBuffer_->release();
|
|
constantBuffer_ = NULL;
|
|
return false;
|
|
}
|
|
else if (constantBuffer_ == NULL) {
|
|
return false;
|
|
}
|
|
|
|
// Assign the constant buffer to the current virtual GPU
|
|
constantBuffer_->setVirtualDevice(&gpu());
|
|
|
|
if (dev().settings().xferBufSize_ > 0) {
|
|
xferBufferSize_ = dev().settings().xferBufSize_;
|
|
for (uint i = 0; i < MaxXferBuffers; ++i) {
|
|
// Create internal xfer buffers for image copy optimization
|
|
xferBuffers_[i] = new (*context_)
|
|
amd::Buffer(*context_, 0, xferBufferSize_);
|
|
|
|
if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) {
|
|
xferBuffers_[i]->release();
|
|
xferBuffers_[i] = NULL;
|
|
return false;
|
|
}
|
|
else if (xferBuffers_[i] == NULL) {
|
|
return false;
|
|
}
|
|
|
|
// Assign the xfer buffer to the current virtual GPU
|
|
xferBuffers_[i]->setVirtualDevice(&gpu());
|
|
//! @note Workaround for conformance allocation test.
|
|
//! Force GPU mem alloc.
|
|
//! Unaligned images require xfer optimization,
|
|
//! but deferred memory allocation can cause
|
|
//! virtual heap fragmentation for big allocations and
|
|
//! then fail the following test with 32 bit ISA, because
|
|
//! runtime runs out of 4GB space.
|
|
dev().getGpuMemory(xferBuffers_[i]);
|
|
}
|
|
}
|
|
|
|
lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true);
|
|
if (NULL == lockXferOps_) {
|
|
return false;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
// The following data structures will be used for the view creations.
|
|
// Some formats has to be converted before a kernel blit operation
|
|
struct FormatConvertion {
|
|
cl_uint clOldType_;
|
|
cl_uint clNewType_;
|
|
};
|
|
|
|
// The list of rejected data formats and corresponding conversion
|
|
static const FormatConvertion RejectedData[] =
|
|
{
|
|
{ CL_UNORM_INT8, CL_UNSIGNED_INT8 },
|
|
{ CL_UNORM_INT16, CL_UNSIGNED_INT16 },
|
|
{ CL_SNORM_INT8, CL_UNSIGNED_INT8 },
|
|
{ CL_SNORM_INT16, CL_UNSIGNED_INT16 },
|
|
{ CL_HALF_FLOAT, CL_UNSIGNED_INT16 },
|
|
{ CL_FLOAT, CL_UNSIGNED_INT32 },
|
|
{ CL_SIGNED_INT8, CL_UNSIGNED_INT8 },
|
|
{ CL_SIGNED_INT16, CL_UNSIGNED_INT16 },
|
|
{ CL_UNORM_INT_101010, CL_UNSIGNED_INT8 },
|
|
{ CL_SIGNED_INT32, CL_UNSIGNED_INT32 }
|
|
};
|
|
|
|
// The list of rejected channel's order and corresponding conversion
|
|
static const FormatConvertion RejectedOrder[] =
|
|
{
|
|
{ CL_A, CL_R },
|
|
{ CL_RA, CL_RG },
|
|
{ CL_LUMINANCE, CL_R },
|
|
{ CL_INTENSITY, CL_R },
|
|
{ CL_RGB, CL_RGBA },
|
|
{ CL_BGRA, CL_RGBA },
|
|
{ CL_ARGB, CL_RGBA },
|
|
{ CL_sRGB, CL_RGBA },
|
|
{ CL_sRGBx, CL_RGBA },
|
|
{ CL_sRGBA, CL_RGBA },
|
|
{ CL_sBGRA, CL_RGBA }
|
|
};
|
|
|
|
const uint RejectedFormatDataTotal =
|
|
sizeof(RejectedData) / sizeof(FormatConvertion);
|
|
const uint RejectedFormatChannelTotal =
|
|
sizeof(RejectedOrder) / sizeof(FormatConvertion);
|
|
|
|
bool
|
|
KernelBlitManager::copyBufferToImage(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire,
|
|
size_t rowPitch,
|
|
size_t slicePitch) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
static const bool CopyRect = false;
|
|
// Flush DMA for ASYNC copy
|
|
static const bool FlushDMA = true;
|
|
|
|
if (setup_.disableCopyBufferToImage_) {
|
|
result = DmaBlitManager::copyBufferToImage(
|
|
srcMemory, dstMemory, srcOrigin, dstOrigin, size,
|
|
entire, rowPitch, slicePitch);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
// Check if buffer is in system memory with direct access
|
|
else if (gpuMem(srcMemory).isHostMemDirectAccess() &&
|
|
(rowPitch == 0) && (slicePitch == 0)) {
|
|
// First attempt to do this all with DMA,
|
|
// but there are restriciton with older hardware
|
|
if (dev().settings().imageDMA_) {
|
|
result = DmaBlitManager::copyBufferToImage(
|
|
srcMemory, dstMemory, srcOrigin, dstOrigin, size,
|
|
entire, rowPitch, slicePitch);
|
|
if (result) {
|
|
synchronize();
|
|
return result;
|
|
}
|
|
}
|
|
|
|
if (!setup_.disableCopyBufferToImageOpt_) {
|
|
// Find the overall copy size
|
|
size_t copySize = size[0] * size[1] * size[2] * gpuMem(dstMemory).elementSize();
|
|
|
|
// Check if double copy was requested
|
|
if (xferBufferSize_ != 0) {
|
|
amd::Coord3D src(srcOrigin);
|
|
amd::Coord3D xferSrc(0, 0, 0);
|
|
amd::Coord3D dst(dstOrigin);
|
|
amd::Coord3D xferRect(size);
|
|
// Find transfer size in pixels
|
|
size_t xferSizePix = xferBufferSize_ / gpuMem(dstMemory).elementSize();
|
|
bool transfer = true;
|
|
|
|
// Find transfer rectangle
|
|
if (xferRect[0] > xferSizePix) {
|
|
// The algorithm can't break a line.
|
|
// It requires multiple rectangles tracking
|
|
transfer = false;
|
|
}
|
|
else {
|
|
xferRect.c[1] = xferSizePix / xferRect[0];
|
|
}
|
|
// Check if we exceeded the original size boundary in Y
|
|
if (xferRect[1] > size[1]) {
|
|
xferRect.c[1] = size[1];
|
|
xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]);
|
|
}
|
|
else {
|
|
xferRect.c[2] = 1;
|
|
}
|
|
// Check if we exceeded the original size boundary in Z
|
|
if (xferRect[2] > size[2]) {
|
|
xferRect.c[2] = size[2];
|
|
}
|
|
// Make sure size in Y dimension is divided by the rectangle size
|
|
if (size[2] > 1) {
|
|
while ((size[1] % xferRect[1]) != 0) {
|
|
xferRect.c[1]--;
|
|
}
|
|
}
|
|
|
|
// Find one step copy size, based on the copy rectange
|
|
amd::Coord3D oneStepSize(
|
|
xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(dstMemory).elementSize());
|
|
|
|
// Initialize transfer buffer array
|
|
Memory* xferBuf[MaxXferBuffers];
|
|
for (uint i = 0; i < MaxXferBuffers; ++i) {
|
|
xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]);
|
|
if (xferBuf[i] == NULL) {
|
|
transfer = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Loop until we transfer all data
|
|
while (transfer && (copySize > 0)) {
|
|
size_t copySizeTmp = copySize;
|
|
amd::Coord3D srcTmp(src);
|
|
amd::Coord3D oneStepSizeTmp(oneStepSize);
|
|
// Step 1. Initiate DRM transfer with all staging buffers
|
|
for (uint i = 0; i < MaxXferBuffers; ++i) {
|
|
// Make sure we don't transfer more than copy size
|
|
if (copySizeTmp > 0) {
|
|
if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcTmp,
|
|
xferSrc, oneStepSizeTmp, *xferBuf[i], CopyRect, FlushDMA)) {
|
|
transfer = false;
|
|
break;
|
|
}
|
|
|
|
copySizeTmp -= oneStepSizeTmp[0];
|
|
// Change buffer offset
|
|
srcTmp.c[0] += oneStepSizeTmp[0];
|
|
|
|
if (copySizeTmp < oneStepSizeTmp[0]) {
|
|
oneStepSizeTmp.c[0] = copySizeTmp;
|
|
}
|
|
}
|
|
else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Step 2. Initiate compute transfer with all staging buffers
|
|
for (uint i = 0; i < MaxXferBuffers; ++i) {
|
|
if (copySize > 0) {
|
|
if (!copyBufferToImageKernel(
|
|
*xferBuf[i], dstMemory,
|
|
xferSrc, dst, xferRect, false)) {
|
|
transfer = false;
|
|
break;
|
|
}
|
|
gpu().flushDMA(MainEngine);
|
|
|
|
copySize -= oneStepSize[0];
|
|
// Change buffer offset
|
|
src.c[0] += oneStepSize[0];
|
|
// Change image offset, ignore X offset
|
|
for (uint j = 1; j < 3; ++j) {
|
|
dst.c[j] += xferRect[j];
|
|
if ((dst[j] - dstOrigin[j]) >= size[j]) {
|
|
dst.c[j] = dstOrigin[j];
|
|
}
|
|
else {
|
|
break;
|
|
}
|
|
}
|
|
// Recalculate rectangle size if the remain data is smaller
|
|
if (copySize < oneStepSize[0]) {
|
|
for (uint j = 0; j < 3; ++j) {
|
|
xferRect.c[j] = size[j] - (dst[j] - dstOrigin[j]);
|
|
}
|
|
oneStepSize.c[0] = copySize;
|
|
}
|
|
}
|
|
else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (copySize == 0) {
|
|
result = true;
|
|
}
|
|
else {
|
|
LogWarning("2 step transfer in copyBufferToImage failed");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!result) {
|
|
result = copyBufferToImageKernel(srcMemory,
|
|
dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
void
|
|
CalcRowSlicePitches(
|
|
cl_int* pitch, const cl_int* copySize,
|
|
size_t rowPitch, size_t slicePitch, const Memory& mem)
|
|
{
|
|
size_t memFmtSize = memoryFormatSize(mem.cal()->format_).size_;
|
|
bool img1Darray = (mem.cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) ? true : false;
|
|
|
|
if (rowPitch == 0) {
|
|
pitch[0] = copySize[0];
|
|
}
|
|
else {
|
|
pitch[0] = rowPitch / memFmtSize;
|
|
}
|
|
if (slicePitch == 0) {
|
|
pitch[1] = pitch[0] * (img1Darray ? 1 : copySize[1]);
|
|
}
|
|
else {
|
|
pitch[1] = slicePitch / memFmtSize;
|
|
}
|
|
assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch");
|
|
|
|
if (img1Darray) {
|
|
// For 1D array rowRitch = slicePitch
|
|
pitch[0] = pitch[1];
|
|
}
|
|
}
|
|
|
|
static void
|
|
setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value)
|
|
{
|
|
const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
|
|
|
|
void* param = kernel->parameters().values() + desc.offset_;
|
|
assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) &&
|
|
"not a valid local mem arg");
|
|
|
|
uint32_t uint32_value = 0;
|
|
uint64_t uint64_value = 0;
|
|
|
|
if (desc.type_ == T_POINTER && desc.size_ != 0) {
|
|
if ((value == NULL) || (static_cast<const cl_mem*>(value) == NULL)) {
|
|
LP64_SWITCH(uint32_value, uint64_value) = 0;
|
|
}
|
|
else {
|
|
// convert cl_mem to amd::Memory*, return false if invalid.
|
|
LP64_SWITCH(uint32_value, uint64_value) =
|
|
(uintptr_t)(*static_cast<Memory* const *>(value));
|
|
}
|
|
}
|
|
else if (desc.type_ == T_SAMPLER) {
|
|
assert(false && "No sampler support in blit manager! Use internal samplers!");
|
|
}
|
|
else switch (desc.size_) {
|
|
case 1: uint32_value = *static_cast<const uint8_t*>(value); break;
|
|
case 2: uint32_value = *static_cast<const uint16_t*>(value); break;
|
|
case 4: uint32_value = *static_cast<const uint32_t*>(value); break;
|
|
case 8: uint64_value = *static_cast<const uint64_t*>(value); break;
|
|
default: break;
|
|
}
|
|
|
|
switch (desc.size_) {
|
|
case 0 /*local mem*/ : *static_cast<size_t*>(param) = size; break;
|
|
case sizeof(uint32_t): *static_cast<uint32_t*>(param) = uint32_value; break;
|
|
case sizeof(uint64_t): *static_cast<uint64_t*>(param) = uint64_value; break;
|
|
default: ::memcpy(param, value, size); break;
|
|
}
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::copyBufferToImageKernel(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire,
|
|
size_t rowPitch,
|
|
size_t slicePitch) const
|
|
{
|
|
bool rejected = false;
|
|
Memory* dstView = &gpuMem(dstMemory);
|
|
bool releaseView = false;
|
|
bool result = false;
|
|
CalFormat imgFormat;
|
|
imgFormat.channelOrder_ = gpuMem(dstMemory).cal()->channelOrder_;
|
|
imgFormat.type_ = gpuMem(dstMemory).cal()->format_;
|
|
amd::Image::Format newFormat(dev().getOclFormat(imgFormat));
|
|
|
|
// Find unsupported formats
|
|
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
|
|
if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
|
|
newFormat.image_channel_data_type = RejectedData[i].clNewType_;
|
|
rejected = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Find unsupported channel's order
|
|
for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
|
|
if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
|
|
newFormat.image_channel_order = RejectedOrder[i].clNewType_;
|
|
rejected = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// If the image format was rejected, then attempt to create a view
|
|
if (rejected) {
|
|
dstView = createView(gpuMem(dstMemory), dev().getCalFormat(newFormat));
|
|
if (dstView != NULL) {
|
|
rejected = false;
|
|
releaseView = true;
|
|
}
|
|
}
|
|
|
|
// Fall into the host path if the image format was rejected
|
|
if (rejected) {
|
|
return HostBlitManager::copyBufferToImage(
|
|
srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
|
|
}
|
|
|
|
// Use a common blit type with three dimensions by default
|
|
uint blitType = BlitCopyBufferToImage;
|
|
size_t dim = 0;
|
|
size_t globalWorkOffset[3] = { 0, 0, 0 };
|
|
size_t globalWorkSize[3];
|
|
size_t localWorkSize[3];
|
|
bool swapLayer = (gpuMem(dstMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) &&
|
|
!dev().settings().siPlus_;
|
|
|
|
// Program the kernels workload depending on the blit dimensions
|
|
dim = 3;
|
|
if (gpuMem(dstMemory).cal()->dimSize_ == 1) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 256);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 1);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 1);
|
|
localWorkSize[0] = 256;
|
|
localWorkSize[1] = localWorkSize[2] = 1;
|
|
}
|
|
else if (gpuMem(dstMemory).cal()->dimSize_ == 2) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 16);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 16);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 1);
|
|
localWorkSize[0] = localWorkSize[1] = 16;
|
|
localWorkSize[2] = 1;
|
|
// Swap the Y and Z components, apparently HW expects
|
|
// layer in Z
|
|
if (swapLayer) {
|
|
globalWorkSize[2] = globalWorkSize[1];
|
|
globalWorkSize[1] = 1;
|
|
localWorkSize[2] = localWorkSize[1];
|
|
localWorkSize[1] = 1;
|
|
}
|
|
}
|
|
else {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 8);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 8);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 4);
|
|
localWorkSize[0] = localWorkSize[1] = 8;
|
|
localWorkSize[2] = 4;
|
|
}
|
|
|
|
// Program kernels arguments for the blit operation
|
|
Memory* mem = &gpuMem(srcMemory);
|
|
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
|
|
mem = dstView;
|
|
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
|
const MemFormatStruct& memFmt = memoryFormatSize(gpuMem(dstMemory).cal()->format_);
|
|
|
|
// 1 element granularity for writes by default
|
|
cl_int granularity = 1;
|
|
if (memFmt.size_ == 2) {
|
|
granularity = 2;
|
|
}
|
|
else if (memFmt.size_ >= 4) {
|
|
granularity = 4;
|
|
}
|
|
CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
|
|
cl_int srcOrg[4] = { (cl_int)srcOrigin[0] / granularity,
|
|
(cl_int)srcOrigin[1],
|
|
(cl_int)srcOrigin[2], 0 };
|
|
setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
|
|
|
|
cl_int dstOrg[4] = { (cl_int)dstOrigin[0],
|
|
(cl_int)dstOrigin[1],
|
|
(cl_int)dstOrigin[2], 0 };
|
|
cl_int copySize[4] = { (cl_int)size[0],
|
|
(cl_int)size[1],
|
|
(cl_int)size[2], 0 };
|
|
if (swapLayer) {
|
|
dstOrg[2] = dstOrg[1];
|
|
dstOrg[1] = 0;
|
|
copySize[2] = copySize[1];
|
|
copySize[1] = 1;
|
|
}
|
|
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
|
|
setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
|
|
|
|
// Program memory format
|
|
uint multiplier = memFmt.size_ / sizeof(uint32_t);
|
|
multiplier = (multiplier == 0) ? 1 : multiplier;
|
|
cl_int format[4] = { (cl_int)memFmt.components_,
|
|
(cl_int)memFmt.size_ / (cl_int)memFmt.components_,
|
|
(cl_int)multiplier, 0 };
|
|
setArgument(kernels_[blitType], 5, sizeof(format), format);
|
|
|
|
// Program row and slice pitches
|
|
cl_int pitch[4] = { 0 };
|
|
CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory));
|
|
setArgument(kernels_[blitType], 6, sizeof(pitch), pitch);
|
|
|
|
// Create ND range object for the kernel's execution
|
|
amd::NDRangeContainer ndrange(dim,
|
|
globalWorkOffset, globalWorkSize, localWorkSize);
|
|
|
|
// Execute the blit
|
|
address parameters = kernels_[blitType]->parameters().values();
|
|
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
|
if (releaseView) {
|
|
delete dstView;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::copyImageToBuffer(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire,
|
|
size_t rowPitch,
|
|
size_t slicePitch) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
static const bool CopyRect = false;
|
|
// Flush DMA for ASYNC copy
|
|
static const bool FlushDMA = true;
|
|
|
|
if (setup_.disableCopyImageToBuffer_) {
|
|
result = HostBlitManager::copyImageToBuffer(
|
|
srcMemory, dstMemory, srcOrigin, dstOrigin,
|
|
size, entire, rowPitch, slicePitch);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
// Check if buffer is in system memory with direct access
|
|
else if (gpuMem(dstMemory).isHostMemDirectAccess() &&
|
|
(rowPitch == 0) && (slicePitch == 0)) {
|
|
// First attempt to do this all with DMA,
|
|
// but there are restriciton with older hardware
|
|
if (dev().settings().imageDMA_) {
|
|
result = DmaBlitManager::copyImageToBuffer(
|
|
srcMemory, dstMemory, srcOrigin, dstOrigin,
|
|
size, entire, rowPitch, slicePitch);
|
|
if (result) {
|
|
synchronize();
|
|
return result;
|
|
}
|
|
}
|
|
|
|
// Find the overall copy size
|
|
size_t copySize = size[0] * size[1] * size[2] * gpuMem(srcMemory).elementSize();
|
|
|
|
// Check if double copy was requested
|
|
if (xferBufferSize_ != 0) {
|
|
amd::Coord3D src(srcOrigin);
|
|
amd::Coord3D dst(dstOrigin);
|
|
amd::Coord3D xferDst(0, 0, 0);
|
|
amd::Coord3D xferRect(size);
|
|
// Find transfer size in pixels
|
|
size_t xferSizePix = xferBufferSize_ / gpuMem(srcMemory).elementSize();
|
|
bool transfer = true;
|
|
|
|
// Find transfer rectangle
|
|
if (xferRect[0] > xferSizePix) {
|
|
// The algorithm can't break a line.
|
|
// It requires multiple rectangles tracking
|
|
transfer = false;
|
|
}
|
|
else {
|
|
xferRect.c[1] = xferSizePix / xferRect[0];
|
|
}
|
|
// Check if we exceeded the original size boundary in Y
|
|
if (xferRect[1] > size[1]) {
|
|
xferRect.c[1] = size[1];
|
|
xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]);
|
|
}
|
|
else {
|
|
xferRect.c[2] = 1;
|
|
}
|
|
// Check if we exceeded the original size boundary in Z
|
|
if (xferRect[2] > size[2]) {
|
|
xferRect.c[2] = size[2];
|
|
}
|
|
// Make sure size in Y dimension is divided by the rectangle size
|
|
if (size[2] > 1) {
|
|
while ((size[1] % xferRect[1]) != 0) {
|
|
xferRect.c[1]--;
|
|
}
|
|
}
|
|
|
|
// Find one step copy size, based on the copy rectange
|
|
amd::Coord3D oneStepSize(
|
|
xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(srcMemory).elementSize());
|
|
|
|
// Initialize transfer buffer array
|
|
Memory* xferBuf[MaxXferBuffers];
|
|
for (uint i = 0; i < MaxXferBuffers; ++i) {
|
|
xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]);
|
|
if (xferBuf[i] == NULL) {
|
|
transfer = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Loop until we transfer all data
|
|
while (transfer && (copySize > 0)) {
|
|
size_t copySizeTmp = copySize;
|
|
amd::Coord3D srcTmp(src);
|
|
amd::Coord3D oneStepSizeTmp(oneStepSize);
|
|
amd::Coord3D xferRectTmp(xferRect);
|
|
|
|
// Step 1. Initiate compute transfer with all staging buffers
|
|
for (uint i = 0; i < MaxXferBuffers; ++i) {
|
|
if (copySizeTmp > 0) {
|
|
if (!copyImageToBufferKernel(
|
|
srcMemory, *xferBuf[i],
|
|
srcTmp, xferDst, xferRectTmp, false)) {
|
|
transfer = false;
|
|
break;
|
|
}
|
|
gpu().flushDMA(MainEngine);
|
|
|
|
copySizeTmp -= oneStepSizeTmp[0];
|
|
// Change image offset, ignore X offset
|
|
for (uint j = 1; j < 3; ++j) {
|
|
srcTmp.c[j] += xferRectTmp[j];
|
|
if ((srcTmp[j] - srcOrigin[j]) >= size[j]) {
|
|
srcTmp.c[j] = srcOrigin[j];
|
|
}
|
|
else {
|
|
break;
|
|
}
|
|
}
|
|
// Recalculate rectangle size if the remain data is smaller
|
|
if (copySizeTmp < oneStepSizeTmp[0]) {
|
|
for (uint j = 0; j < 3; ++j) {
|
|
xferRectTmp.c[j] = size[j] - (srcTmp[j] - srcOrigin[j]);
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Step 2. Initiate DRM transfer with all staging buffers
|
|
for (uint i = 0; i < MaxXferBuffers; ++i) {
|
|
// Make sure we don't transfer more than copy size
|
|
if (copySize > 0) {
|
|
if (!xferBuf[i]->partialMemCopyTo(gpu(), xferDst, dst,
|
|
oneStepSize, gpuMem(dstMemory), CopyRect, FlushDMA)) {
|
|
transfer = false;
|
|
break;
|
|
}
|
|
|
|
copySize -= oneStepSize[0];
|
|
// Change buffer offset
|
|
dst.c[0] += oneStepSize[0];
|
|
// Change image offset, ignore X offset
|
|
for (uint j = 1; j < 3; ++j) {
|
|
src.c[j] += xferRect[j];
|
|
if ((src[j] - srcOrigin[j]) >= size[j]) {
|
|
src.c[j] = srcOrigin[j];
|
|
}
|
|
else {
|
|
break;
|
|
}
|
|
}
|
|
// Recalculate rectangle size if the remain data is smaller
|
|
if (copySize < oneStepSize[0]) {
|
|
for (uint j = 0; j < 3; ++j) {
|
|
xferRect.c[j] = size[j] - (src[j] - srcOrigin[j]);
|
|
}
|
|
oneStepSize.c[0] = copySize;
|
|
}
|
|
}
|
|
else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (copySize == 0) {
|
|
result = true;
|
|
}
|
|
else {
|
|
LogWarning("2 step transfer in copyBufferToImage failed");
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!result) {
|
|
result = copyImageToBufferKernel(srcMemory,
|
|
dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::copyImageToBufferKernel(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire,
|
|
size_t rowPitch,
|
|
size_t slicePitch) const
|
|
{
|
|
bool rejected = false;
|
|
Memory* srcView = &gpuMem(srcMemory);
|
|
bool releaseView = false;
|
|
bool result = false;
|
|
CalFormat imgFormat;
|
|
imgFormat.channelOrder_ = gpuMem(srcMemory).cal()->channelOrder_;
|
|
imgFormat.type_ = gpuMem(srcMemory).cal()->format_;
|
|
amd::Image::Format newFormat(dev().getOclFormat(imgFormat));
|
|
|
|
// Find unsupported formats
|
|
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
|
|
if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
|
|
newFormat.image_channel_data_type = RejectedData[i].clNewType_;
|
|
rejected = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Find unsupported channel's order
|
|
for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
|
|
if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
|
|
newFormat.image_channel_order = RejectedOrder[i].clNewType_;
|
|
rejected = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// If the image format was rejected, then attempt to create a view
|
|
if (rejected) {
|
|
srcView = createView(gpuMem(srcMemory), dev().getCalFormat(newFormat));
|
|
if (srcView != NULL) {
|
|
rejected = false;
|
|
releaseView = true;
|
|
}
|
|
}
|
|
|
|
// Fall into the host path if the image format was rejected
|
|
if (rejected) {
|
|
return HostBlitManager::copyImageToBuffer(
|
|
srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
|
|
}
|
|
|
|
uint blitType = BlitCopyImageToBuffer;
|
|
size_t dim = 0;
|
|
size_t globalWorkOffset[3] = { 0, 0, 0 };
|
|
size_t globalWorkSize[3];
|
|
size_t localWorkSize[3];
|
|
bool swapLayer = (gpuMem(srcMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) &&
|
|
!dev().settings().siPlus_;
|
|
|
|
// Program the kernels workload depending on the blit dimensions
|
|
dim = 3;
|
|
// Find the current blit type
|
|
if (gpuMem(srcMemory).cal()->dimSize_ == 1) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 256);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 1);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 1);
|
|
localWorkSize[0] = 256;
|
|
localWorkSize[1] = localWorkSize[2] = 1;
|
|
}
|
|
else if (gpuMem(srcMemory).cal()->dimSize_ == 2) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 16);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 16);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 1);
|
|
localWorkSize[0] = localWorkSize[1] = 16;
|
|
localWorkSize[2] = 1;
|
|
// Swap the Y and Z components, apparently HW expects
|
|
// layer in Z
|
|
if (swapLayer) {
|
|
globalWorkSize[2] = globalWorkSize[1];
|
|
globalWorkSize[1] = 1;
|
|
localWorkSize[2] = localWorkSize[1];
|
|
localWorkSize[1] = 1;
|
|
}
|
|
}
|
|
else {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 8);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 8);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 4);
|
|
localWorkSize[0] = localWorkSize[1] = 8;
|
|
localWorkSize[2] = 4;
|
|
}
|
|
|
|
// Program kernels arguments for the blit operation
|
|
Memory* mem = srcView;
|
|
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
|
|
mem = &gpuMem(dstMemory);
|
|
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
|
|
|
// Update extra paramters for USHORT and UBYTE pointers.
|
|
// Only then compiler can optimize the kernel to use
|
|
// UAV Raw for other writes
|
|
setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem);
|
|
setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem);
|
|
|
|
cl_int srcOrg[4] = { (cl_int)srcOrigin[0],
|
|
(cl_int)srcOrigin[1],
|
|
(cl_int)srcOrigin[2], 0 };
|
|
cl_int copySize[4] = { (cl_int)size[0],
|
|
(cl_int)size[1],
|
|
(cl_int)size[2], 0 };
|
|
if (swapLayer) {
|
|
srcOrg[2] = srcOrg[1];
|
|
srcOrg[1] = 0;
|
|
copySize[2] = copySize[1];
|
|
copySize[1] = 1;
|
|
}
|
|
setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);
|
|
const MemFormatStruct& memFmt = memoryFormatSize(gpuMem(srcMemory).cal()->format_);
|
|
|
|
// 1 element granularity for writes by default
|
|
cl_int granularity = 1;
|
|
if (memFmt.size_ == 2) {
|
|
granularity = 2;
|
|
}
|
|
else if (memFmt.size_ >= 4) {
|
|
granularity = 4;
|
|
}
|
|
CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
|
|
cl_int dstOrg[4] = { (cl_int)dstOrigin[0] / granularity,
|
|
(cl_int)dstOrigin[1],
|
|
(cl_int)dstOrigin[2], 0 };
|
|
setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg);
|
|
setArgument(kernels_[blitType], 6, sizeof(copySize), copySize);
|
|
|
|
// Program memory format
|
|
uint multiplier = memFmt.size_ / sizeof(uint32_t);
|
|
multiplier = (multiplier == 0) ? 1 : multiplier;
|
|
cl_int format[4] = { (cl_int)memFmt.components_,
|
|
(cl_int)memFmt.size_ / (cl_int)memFmt.components_,
|
|
(cl_int)multiplier, 0 };
|
|
setArgument(kernels_[blitType], 7, sizeof(format), format);
|
|
|
|
// Program row and slice pitches
|
|
cl_int pitch[4] = { 0 };
|
|
CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory));
|
|
setArgument(kernels_[blitType], 8, sizeof(pitch), pitch);
|
|
|
|
// Create ND range object for the kernel's execution
|
|
amd::NDRangeContainer ndrange(dim,
|
|
globalWorkOffset, globalWorkSize, localWorkSize);
|
|
|
|
// Execute the blit
|
|
address parameters = kernels_[blitType]->parameters().values();
|
|
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
|
if (releaseView) {
|
|
delete srcView;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::copyImage(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool rejected = false;
|
|
Memory* srcView = &gpuMem(srcMemory);
|
|
Memory* dstView = &gpuMem(dstMemory);
|
|
bool releaseView = false;
|
|
bool result = false;
|
|
CalFormat imgFormat;
|
|
imgFormat.channelOrder_ = gpuMem(srcMemory).cal()->channelOrder_;
|
|
imgFormat.type_ = gpuMem(srcMemory).cal()->format_;
|
|
amd::Image::Format newFormat(dev().getOclFormat(imgFormat));
|
|
|
|
// Find unsupported formats
|
|
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
|
|
if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
|
|
newFormat.image_channel_data_type = RejectedData[i].clNewType_;
|
|
rejected = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Search for the rejected channel's order only if the format was rejected
|
|
// Note: Image blit is independent from the channel order
|
|
if (rejected) {
|
|
for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
|
|
if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
|
|
newFormat.image_channel_order = RejectedOrder[i].clNewType_;
|
|
rejected = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Attempt to create a view if the format was rejected
|
|
if (rejected) {
|
|
srcView = createView(gpuMem(srcMemory), dev().getCalFormat(newFormat));
|
|
if (srcView != NULL) {
|
|
dstView = createView(gpuMem(dstMemory), dev().getCalFormat(newFormat));
|
|
if (dstView != NULL) {
|
|
rejected = false;
|
|
releaseView = true;
|
|
}
|
|
else {
|
|
delete srcView;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fall into the host path for the entire 2D copy or
|
|
// if the image format was rejected
|
|
if (rejected) {
|
|
result = HostBlitManager::copyImage(srcMemory, dstMemory,
|
|
srcOrigin, dstOrigin, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
|
|
uint blitType = BlitCopyImage;
|
|
size_t dim = 0;
|
|
size_t globalWorkOffset[3] = { 0, 0, 0 };
|
|
size_t globalWorkSize[3];
|
|
size_t localWorkSize[3];
|
|
|
|
// Program the kernels workload depending on the blit dimensions
|
|
dim = 3;
|
|
// Find the current blit type
|
|
if ((gpuMem(srcMemory).cal()->dimSize_ == 1) ||
|
|
(gpuMem(dstMemory).cal()->dimSize_ == 1)) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 256);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 1);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 1);
|
|
localWorkSize[0] = 256;
|
|
localWorkSize[1] = localWorkSize[2] = 1;
|
|
}
|
|
else if ((gpuMem(srcMemory).cal()->dimSize_ == 2) ||
|
|
(gpuMem(dstMemory).cal()->dimSize_ == 2)) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 16);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 16);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 1);
|
|
localWorkSize[0] = localWorkSize[1] = 16;
|
|
localWorkSize[2] = 1;
|
|
}
|
|
else {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 8);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 8);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 4);
|
|
localWorkSize[0] = localWorkSize[1] = 8;
|
|
localWorkSize[2] = 4;
|
|
}
|
|
|
|
// The current OpenCL spec allows "copy images from a 1D image
|
|
// array object to a 1D image array object" only.
|
|
if ((gpuMem(srcMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) ||
|
|
(gpuMem(dstMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY)) {
|
|
blitType = BlitCopyImage1DA;
|
|
}
|
|
|
|
// Program kernels arguments for the blit operation
|
|
Memory* mem = srcView;
|
|
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
|
|
mem = dstView;
|
|
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
|
|
|
// Program source origin
|
|
cl_int srcOrg[4] = { (cl_int)srcOrigin[0],
|
|
(cl_int)srcOrigin[1],
|
|
(cl_int)srcOrigin[2], 0 };
|
|
if ((gpuMem(srcMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) &&
|
|
!dev().settings().siPlus_) {
|
|
srcOrg[3] = 1;
|
|
}
|
|
setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
|
|
|
|
// Program destinaiton origin
|
|
cl_int dstOrg[4] = { (cl_int)dstOrigin[0],
|
|
(cl_int)dstOrigin[1],
|
|
(cl_int)dstOrigin[2], 0 };
|
|
if ((gpuMem(dstMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) &&
|
|
!dev().settings().siPlus_) {
|
|
dstOrg[3] = 1;
|
|
}
|
|
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
|
|
|
|
cl_int copySize[4] = { (cl_int)size[0],
|
|
(cl_int)size[1],
|
|
(cl_int)size[2], 0 };
|
|
setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
|
|
|
|
// Create ND range object for the kernel's execution
|
|
amd::NDRangeContainer ndrange(dim,
|
|
globalWorkOffset, globalWorkSize, localWorkSize);
|
|
|
|
// Execute the blit
|
|
address parameters = kernels_[blitType]->parameters().values();
|
|
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
|
if (releaseView) {
|
|
delete srcView;
|
|
delete dstView;
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
void
|
|
FindPinSize(
|
|
size_t& pinSize, const amd::Coord3D& size,
|
|
size_t& rowPitch, size_t& slicePitch, const Memory& mem)
|
|
{
|
|
pinSize = size[0] * mem.elementSize();
|
|
if ((rowPitch == 0) || (rowPitch == pinSize)) {
|
|
rowPitch = 0;
|
|
}
|
|
else {
|
|
pinSize = rowPitch;
|
|
}
|
|
|
|
// Calculate the pin size, which should be equal to the copy size
|
|
for (uint i = 1; i < mem.cal()->dimSize_; ++i) {
|
|
pinSize *= size[i];
|
|
if (i == 1) {
|
|
if ((slicePitch == 0) || (slicePitch == pinSize)) {
|
|
slicePitch = 0;
|
|
}
|
|
else {
|
|
if (mem.cal()->dimension_ != GSL_MOA_TEXTURE_1D_ARRAY) {
|
|
pinSize = slicePitch;
|
|
}
|
|
else {
|
|
pinSize = slicePitch * size[i];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::readImage(
|
|
device::Memory& srcMemory,
|
|
void* dstHost,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
size_t rowPitch,
|
|
size_t slicePitch,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
|
|
// Use host copy if memory has direct access or it's persistent
|
|
if (setup_.disableReadImage_ ||
|
|
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
|
|
result = HostBlitManager::readImage(srcMemory, dstHost,
|
|
origin, size, rowPitch, slicePitch, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
else {
|
|
size_t pinSize;
|
|
FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory));
|
|
|
|
size_t partial;
|
|
amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
|
|
|
|
if (amdMemory == NULL) {
|
|
// Force SW copy
|
|
result = HostBlitManager::readImage(srcMemory, dstHost,
|
|
origin, size, rowPitch, slicePitch, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
|
|
// Readjust destination offset
|
|
const amd::Coord3D dstOrigin(partial);
|
|
|
|
// Get device memory for this virtual device
|
|
Memory* dstMemory = dev().getGpuMemory(amdMemory);
|
|
|
|
// Copy image to buffer
|
|
result = copyImageToBuffer(srcMemory, *dstMemory,
|
|
origin, dstOrigin, size, entire, rowPitch, slicePitch);
|
|
|
|
// Add pinned memory for a later release
|
|
gpu().addPinnedMem(amdMemory);
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::writeImage(
|
|
const void* srcHost,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
size_t rowPitch,
|
|
size_t slicePitch,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
|
|
// Use host copy if memory has direct access or it's persistent
|
|
if (setup_.disableWriteImage_||
|
|
gpuMem(dstMemory).isHostMemDirectAccess() ||
|
|
gpuMem(dstMemory).isPersistentDirectMap()) {
|
|
result = HostBlitManager::writeImage(
|
|
srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
else {
|
|
size_t pinSize;
|
|
FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory));
|
|
|
|
size_t partial;
|
|
amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
|
|
|
|
if (amdMemory == NULL) {
|
|
// Force SW copy
|
|
result = HostBlitManager::writeImage(
|
|
srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
|
|
// Readjust destination offset
|
|
const amd::Coord3D srcOrigin(partial);
|
|
|
|
// Get device memory for this virtual device
|
|
Memory* srcMemory = dev().getGpuMemory(amdMemory);
|
|
|
|
// Copy image to buffer
|
|
result = copyBufferToImage(*srcMemory, dstMemory,
|
|
srcOrigin, origin, size, entire, rowPitch, slicePitch);
|
|
|
|
// Add pinned memory for a later release
|
|
gpu().addPinnedMem(amdMemory);
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::copyBufferRect(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::BufferRect& srcRectIn,
|
|
const amd::BufferRect& dstRectIn,
|
|
const amd::Coord3D& sizeIn,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
bool rejected = false;
|
|
|
|
// Fall into the CAL path for rejected transfers
|
|
if (setup_.disableCopyBufferRect_ ||
|
|
((gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) &&
|
|
dev().settings().rectLinearDMA_) ||
|
|
(!dev().heap()->isVirtual() &&
|
|
((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))) {
|
|
// Copy data with CAL (no VM mode only)
|
|
if ((gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess())
|
|
&& dev().settings().rectLinearDMA_) {
|
|
result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
|
|
srcRectIn, dstRectIn, sizeIn, entire);
|
|
}
|
|
|
|
if ((!dev().heap()->isVirtual() && ((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))
|
|
&& !result) {
|
|
result = HostBlitManager::copyBufferRect(srcMemory, dstMemory,
|
|
srcRectIn, dstRectIn, sizeIn, entire);
|
|
}
|
|
|
|
if (result) {
|
|
synchronize();
|
|
return result;
|
|
}
|
|
}
|
|
|
|
uint blitType = BlitCopyBufferRect;
|
|
size_t dim = 3;
|
|
size_t globalWorkOffset[3] = { 0, 0, 0 };
|
|
size_t globalWorkSize[3];
|
|
size_t localWorkSize[3];
|
|
|
|
const static uint CopyRectAlignment[3] = { 16, 4, 1 };
|
|
|
|
bool aligned;
|
|
uint i;
|
|
for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) {
|
|
// Check source alignments
|
|
aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0);
|
|
aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0);
|
|
aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0);
|
|
|
|
// Check destination alignments
|
|
aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0);
|
|
aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0);
|
|
aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0);
|
|
|
|
// Check copy size alignment in the first dimension
|
|
aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0);
|
|
|
|
if (aligned) {
|
|
if (CopyRectAlignment[i] != 1) {
|
|
blitType = BlitCopyBufferRectAligned;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
amd::BufferRect srcRect;
|
|
amd::BufferRect dstRect;
|
|
amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
|
|
|
|
srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i];
|
|
srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i];
|
|
srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i];
|
|
srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i];
|
|
|
|
dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i];
|
|
dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i];
|
|
dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i];
|
|
dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i];
|
|
|
|
size.c[0] /= CopyRectAlignment[i];
|
|
|
|
// Program the kernel's workload depending on the transfer dimensions
|
|
if ((size[1] == 1) && (size[2] == 1)) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 256);
|
|
globalWorkSize[1] = 1;
|
|
globalWorkSize[2] = 1;
|
|
localWorkSize[0] = 256;
|
|
localWorkSize[1] = 1;
|
|
localWorkSize[2] = 1;
|
|
}
|
|
else if (size[2] == 1) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 16);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 16);
|
|
globalWorkSize[2] = 1;
|
|
localWorkSize[0] = localWorkSize[1] = 16;
|
|
localWorkSize[2] = 1;
|
|
}
|
|
else {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 8);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 8);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 4);
|
|
localWorkSize[0] = localWorkSize[1] = 8;
|
|
localWorkSize[2] = 4;
|
|
}
|
|
|
|
|
|
// Program kernels arguments for the blit operation
|
|
Memory* mem = &gpuMem(srcMemory);
|
|
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
|
|
mem = &gpuMem(dstMemory);
|
|
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
|
cl_uint src[4] = { (cl_uint)srcRect.rowPitch_,
|
|
(cl_uint)srcRect.slicePitch_,
|
|
(cl_uint)srcRect.start_, 0 };
|
|
setArgument(kernels_[blitType], 2, sizeof(src), src);
|
|
cl_uint dst[4] = { (cl_uint)dstRect.rowPitch_,
|
|
(cl_uint)dstRect.slicePitch_,
|
|
(cl_uint)dstRect.start_, 0 };
|
|
setArgument(kernels_[blitType], 3, sizeof(dst), dst);
|
|
cl_int copySize[4] = { (cl_int)size[0],
|
|
(cl_int)size[1],
|
|
(cl_int)size[2],
|
|
(cl_int)CopyRectAlignment[i] };
|
|
setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
|
|
|
|
// Create ND range object for the kernel's execution
|
|
amd::NDRangeContainer ndrange(dim,
|
|
globalWorkOffset, globalWorkSize, localWorkSize);
|
|
|
|
// Execute the blit
|
|
address parameters = kernels_[blitType]->parameters().values();
|
|
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::readBuffer(
|
|
device::Memory& srcMemory,
|
|
void* dstHost,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
// Use host copy if memory has direct access
|
|
if (setup_.disableReadBuffer_ ||
|
|
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
|
|
result = HostBlitManager::readBuffer(
|
|
srcMemory, dstHost, origin, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
else {
|
|
size_t pinSize = size[0];
|
|
// Check if a pinned transfer can be executed with a single pin
|
|
if ((pinSize <= dev().settings().pinnedXferSize_) &&
|
|
(pinSize > MinSizeForPinnedTransfer)) {
|
|
size_t partial;
|
|
amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
|
|
|
|
if (amdMemory == NULL) {
|
|
// Force SW copy
|
|
result = HostBlitManager::readBuffer(
|
|
srcMemory, dstHost, origin, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
|
|
// Readjust host mem offset
|
|
amd::Coord3D dstOrigin(partial);
|
|
|
|
// Get device memory for this virtual device
|
|
Memory* dstMemory = dev().getGpuMemory(amdMemory);
|
|
|
|
// Copy image to buffer
|
|
result = copyBuffer(srcMemory, *dstMemory,
|
|
origin, dstOrigin, size, entire);
|
|
|
|
// Add pinned memory for a later release
|
|
gpu().addPinnedMem(amdMemory);
|
|
}
|
|
else {
|
|
// Check if runtime has to pin a big allocation and
|
|
// release all pinned memory
|
|
if (pinSize > dev().settings().pinnedXferSize_) {
|
|
gpu().releasePinnedMem();
|
|
}
|
|
result = DmaBlitManager::readBuffer(
|
|
srcMemory, dstHost, origin, size, entire);
|
|
}
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::readBufferRect(
|
|
device::Memory& srcMemory,
|
|
void* dstHost,
|
|
const amd::BufferRect& bufRect,
|
|
const amd::BufferRect& hostRect,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
|
|
// Use host copy if memory has direct access
|
|
if (setup_.disableReadBufferRect_ ||
|
|
(gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
|
|
result = HostBlitManager::readBufferRect(
|
|
srcMemory, dstHost, bufRect, hostRect, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
else {
|
|
size_t pinSize = hostRect.start_ + hostRect.end_;
|
|
size_t partial;
|
|
amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
|
|
|
|
if (amdMemory == NULL) {
|
|
// Force SW copy
|
|
result = HostBlitManager::readBufferRect(
|
|
srcMemory, dstHost, bufRect, hostRect, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
|
|
// Readjust host mem offset
|
|
amd::BufferRect rect;
|
|
rect.rowPitch_ = hostRect.rowPitch_;
|
|
rect.slicePitch_ = hostRect.slicePitch_;
|
|
rect.start_ = hostRect.start_ + partial;
|
|
rect.end_ = hostRect.end_;
|
|
|
|
// Get device memory for this virtual device
|
|
Memory* dstMemory = dev().getGpuMemory(amdMemory);
|
|
|
|
// Copy image to buffer
|
|
result = copyBufferRect(srcMemory, *dstMemory,
|
|
bufRect, rect, size, entire);
|
|
|
|
// Add pinned memory for a later release
|
|
gpu().addPinnedMem(amdMemory);
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::writeBuffer(
|
|
const void* srcHost,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
|
|
// Use host copy if memory has direct access or it's persistent
|
|
if (setup_.disableWriteBuffer_ ||
|
|
gpuMem(dstMemory).isHostMemDirectAccess() ||
|
|
(gpuMem(dstMemory).memoryType() == Resource::Persistent)) {
|
|
result = HostBlitManager::writeBuffer(
|
|
srcHost, dstMemory, origin, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
else {
|
|
size_t pinSize = size[0];
|
|
|
|
// Check if a pinned transfer can be executed with a single pin
|
|
if ((pinSize <= dev().settings().pinnedXferSize_) &&
|
|
(pinSize > MinSizeForPinnedTransfer)) {
|
|
size_t partial;
|
|
amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
|
|
|
|
if (amdMemory == NULL) {
|
|
// Force SW copy
|
|
result = HostBlitManager::writeBuffer(
|
|
srcHost, dstMemory, origin, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
|
|
// Readjust destination offset
|
|
const amd::Coord3D srcOrigin(partial);
|
|
|
|
// Get device memory for this virtual device
|
|
Memory* srcMemory = dev().getGpuMemory(amdMemory);
|
|
|
|
// Copy buffer rect
|
|
result = copyBuffer(*srcMemory, dstMemory,
|
|
srcOrigin, origin, size, entire);
|
|
|
|
// Add pinned memory for a later release
|
|
gpu().addPinnedMem(amdMemory);
|
|
}
|
|
else {
|
|
// Check if runtime has to pin a big allocation and
|
|
// release all pinned memory
|
|
if (pinSize > dev().settings().pinnedXferSize_) {
|
|
gpu().releasePinnedMem();
|
|
}
|
|
result = DmaBlitManager::writeBuffer(
|
|
srcHost, dstMemory, origin, size, entire);
|
|
}
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::writeBufferRect(
|
|
const void* srcHost,
|
|
device::Memory& dstMemory,
|
|
const amd::BufferRect& hostRect,
|
|
const amd::BufferRect& bufRect,
|
|
const amd::Coord3D& size,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
|
|
// Use host copy if memory has direct access or it's persistent
|
|
if (setup_.disableWriteBufferRect_ ||
|
|
gpuMem(dstMemory).isHostMemDirectAccess() ||
|
|
gpuMem(dstMemory).isPersistentDirectMap()) {
|
|
result = HostBlitManager::writeBufferRect(
|
|
srcHost, dstMemory, hostRect, bufRect, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
else {
|
|
size_t pinSize = hostRect.start_ + hostRect.end_;
|
|
size_t partial;
|
|
amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
|
|
|
|
if (amdMemory == NULL) {
|
|
// Force SW copy
|
|
result = HostBlitManager::writeBufferRect(
|
|
srcHost, dstMemory, hostRect, bufRect, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
|
|
// Readjust destination offset
|
|
const amd::Coord3D srcOrigin(partial);
|
|
|
|
// Get device memory for this virtual device
|
|
Memory* srcMemory = dev().getGpuMemory(amdMemory);
|
|
|
|
// Readjust host mem offset
|
|
amd::BufferRect rect;
|
|
rect.rowPitch_ = hostRect.rowPitch_;
|
|
rect.slicePitch_ = hostRect.slicePitch_;
|
|
rect.start_ = hostRect.start_ + partial;
|
|
rect.end_ = hostRect.end_;
|
|
|
|
// Copy buffer rect
|
|
result = copyBufferRect(*srcMemory, dstMemory,
|
|
rect, bufRect, size, entire);
|
|
|
|
// Add pinned memory for a later release
|
|
gpu().addPinnedMem(amdMemory);
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::fillBuffer(
|
|
device::Memory& memory,
|
|
const void* pattern,
|
|
size_t patternSize,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
bool entire
|
|
) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
|
|
// Use host fill if memory has direct access
|
|
if (setup_.disableFillBuffer_ ||
|
|
gpuMem(memory).isHostMemDirectAccess()) {
|
|
result = HostBlitManager::fillBuffer(
|
|
memory, pattern, patternSize, origin, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
else {
|
|
uint fillType = FillBuffer;
|
|
size_t globalWorkOffset[3] = { 0, 0, 0 };
|
|
cl_int fillSize = size[0] / patternSize;
|
|
size_t globalWorkSize = amd::alignUp(fillSize, 256);
|
|
size_t localWorkSize = 256;
|
|
bool dwordAligned =
|
|
((patternSize % sizeof(uint32_t)) == 0) ? true : false;
|
|
|
|
// Program kernels arguments for the fill operation
|
|
Memory* mem = &gpuMem(memory);
|
|
if (dwordAligned) {
|
|
setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL);
|
|
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
|
|
}
|
|
else {
|
|
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
|
|
setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL);
|
|
}
|
|
Memory* gpuCB = dev().getGpuMemory(constantBuffer_);
|
|
if (gpuCB == NULL) {
|
|
return false;
|
|
}
|
|
void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly);
|
|
memcpy(constBuf, pattern, patternSize);
|
|
gpuCB->unmap(&gpu());
|
|
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB);
|
|
cl_int offset = origin[0];
|
|
if (dwordAligned) {
|
|
patternSize /= sizeof(uint32_t);
|
|
offset /= sizeof(uint32_t);
|
|
}
|
|
setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize);
|
|
setArgument(kernels_[fillType], 4, sizeof(offset), &offset);
|
|
setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize);
|
|
|
|
// Create ND range object for the kernel's execution
|
|
amd::NDRangeContainer ndrange(1,
|
|
globalWorkOffset, &globalWorkSize, &localWorkSize);
|
|
|
|
// Execute the blit
|
|
address parameters = kernels_[fillType]->parameters().values();
|
|
result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::copyBuffer(
|
|
device::Memory& srcMemory,
|
|
device::Memory& dstMemory,
|
|
const amd::Coord3D& srcOrigin,
|
|
const amd::Coord3D& dstOrigin,
|
|
const amd::Coord3D& sizeIn,
|
|
bool entire) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
bool forceCal = !dev().heap()->isVirtual() &&
|
|
((gpuMem(srcMemory).hb() == NULL) || (gpuMem(dstMemory).hb() == NULL));
|
|
|
|
if ((!forceCal && !gpuMem(srcMemory).isHostMemDirectAccess() &&
|
|
!gpuMem(dstMemory).isHostMemDirectAccess())) {
|
|
uint blitType = BlitCopyBuffer;
|
|
size_t dim = 1;
|
|
size_t globalWorkOffset[3] = { 0, 0, 0 };
|
|
size_t globalWorkSize = 0;
|
|
size_t localWorkSize = 0;
|
|
|
|
const static uint CopyBuffAlignment[3] = { 16, 4, 1 };
|
|
amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
|
|
|
|
bool aligned;
|
|
uint i;
|
|
for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) {
|
|
// Check source alignments
|
|
aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0);
|
|
// Check destination alignments
|
|
aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0);
|
|
// Check copy size alignment in the first dimension
|
|
aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0);
|
|
|
|
if (aligned) {
|
|
if (CopyBuffAlignment[i] != 1) {
|
|
blitType = BlitCopyBufferAligned;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
size.c[0] /= CopyBuffAlignment[i];
|
|
|
|
// Program the dispatch dimensions
|
|
localWorkSize = 256;
|
|
globalWorkSize = amd::alignUp(size[0] , 256);
|
|
|
|
// Program kernels arguments for the blit operation
|
|
Memory* mem = &gpuMem(srcMemory);
|
|
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
|
|
mem = &gpuMem(dstMemory);
|
|
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
|
// Program source origin
|
|
cl_int srcOffset = srcOrigin[0] / CopyBuffAlignment[i];;
|
|
setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
|
|
|
|
// Program destinaiton origin
|
|
cl_int dstOffset = dstOrigin[0] / CopyBuffAlignment[i];;
|
|
setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
|
|
|
|
cl_int copySize = size[0];
|
|
setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size);
|
|
|
|
if (blitType == BlitCopyBufferAligned) {
|
|
cl_int alignment = CopyBuffAlignment[i];
|
|
setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment);
|
|
}
|
|
|
|
// Create ND range object for the kernel's execution
|
|
amd::NDRangeContainer ndrange(1,
|
|
globalWorkOffset, &globalWorkSize, &localWorkSize);
|
|
|
|
// Execute the blit
|
|
address parameters = kernels_[blitType]->parameters().values();
|
|
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
|
}
|
|
else {
|
|
// Copy data with CAL (no VM mode only)
|
|
result = DmaBlitManager::copyBuffer(
|
|
srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::fillImage(
|
|
device::Memory& memory,
|
|
const void* pattern,
|
|
const amd::Coord3D& origin,
|
|
const amd::Coord3D& size,
|
|
bool entire
|
|
) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
|
|
// Use host fill if memory has direct access
|
|
if (setup_.disableFillImage_ ||
|
|
gpuMem(memory).isHostMemDirectAccess()) {
|
|
result = HostBlitManager::fillImage(
|
|
memory, pattern, origin, size, entire);
|
|
synchronize();
|
|
return result;
|
|
}
|
|
|
|
uint fillType;
|
|
size_t dim = 0;
|
|
size_t globalWorkOffset[3] = { 0, 0, 0 };
|
|
size_t globalWorkSize[3];
|
|
size_t localWorkSize[3];
|
|
Memory* memView = &gpuMem(memory);
|
|
amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
|
|
|
|
bool swapLayer = (memView->cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) &&
|
|
!dev().settings().siPlus_;
|
|
|
|
// Program the kernels workload depending on the fill dimensions
|
|
fillType = FillImage;
|
|
dim = 3;
|
|
|
|
void *newpattern = const_cast<void *>(pattern);
|
|
cl_uint4 iFillColor;
|
|
|
|
bool rejected = false;
|
|
bool releaseView = false;
|
|
// For depth, we need to create a view
|
|
if ((memView->cal()->format_ == CM_SURF_FMT_DEPTH32F) ||
|
|
(memView->cal()->format_ == CM_SURF_FMT_RGBA8_SRGB) ||
|
|
(memView->cal()->format_ == CM_SURF_FMT_DEPTH16)) {
|
|
|
|
// Find unsupported data type
|
|
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
|
|
if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
|
|
newFormat.image_channel_data_type = RejectedData[i].clNewType_;
|
|
rejected = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Below may not be correct. We need to find why unsigned int view doesn't work for DEPTH16.
|
|
if (gpuMem(memory).cal()->format_ == CM_SURF_FMT_DEPTH16) {
|
|
newFormat.image_channel_data_type = CL_UNORM_INT16;
|
|
}
|
|
|
|
if (gpuMem(memory).cal()->format_ == CM_SURF_FMT_RGBA8_SRGB) {
|
|
// Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value because hw is not support write_imagef for sRGB.
|
|
float *fColor = static_cast<float *>(newpattern);
|
|
iFillColor.s[0] = sRGBmap(fColor[0]);
|
|
iFillColor.s[1] = sRGBmap(fColor[1]);
|
|
iFillColor.s[2] = sRGBmap(fColor[2]);
|
|
iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f);
|
|
newpattern = static_cast<void*>(&iFillColor);
|
|
for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
|
|
if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
|
|
newFormat.image_channel_order = RejectedOrder[i].clNewType_;
|
|
rejected = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// If the image format was rejected, then attempt to create a view
|
|
if (rejected) {
|
|
memView = createView(gpuMem(memory), dev().getCalFormat(newFormat));
|
|
if (memView != NULL) {
|
|
rejected = false;
|
|
releaseView = true;
|
|
}
|
|
}
|
|
|
|
// Find the current blit type
|
|
if (memView->cal()->dimSize_ == 1) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 256);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 1);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 1);
|
|
localWorkSize[0] = 256;
|
|
localWorkSize[1] = localWorkSize[2] = 1;
|
|
}
|
|
else if (memView->cal()->dimSize_ == 2) {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 16);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 16);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 1);
|
|
localWorkSize[0] = localWorkSize[1] = 16;
|
|
localWorkSize[2] = 1;
|
|
// Swap the Y and Z components, apparently HW expects
|
|
// layer in Z
|
|
if (swapLayer) {
|
|
globalWorkSize[2] = globalWorkSize[1];
|
|
globalWorkSize[1] = 1;
|
|
localWorkSize[2] = localWorkSize[1];
|
|
localWorkSize[1] = 1;
|
|
}
|
|
}
|
|
else {
|
|
globalWorkSize[0] = amd::alignUp(size[0], 8);
|
|
globalWorkSize[1] = amd::alignUp(size[1], 8);
|
|
globalWorkSize[2] = amd::alignUp(size[2], 4);
|
|
localWorkSize[0] = localWorkSize[1] = 8;
|
|
localWorkSize[2] = 4;
|
|
}
|
|
|
|
// Program kernels arguments for the blit operation
|
|
Memory* mem = memView;
|
|
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
|
|
setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern);
|
|
setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern);
|
|
setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern);
|
|
|
|
cl_int fillOrigin[4] = { (cl_int)origin[0],
|
|
(cl_int)origin[1],
|
|
(cl_int)origin[2], 0 };
|
|
cl_int fillSize[4] = { (cl_int)size[0],
|
|
(cl_int)size[1],
|
|
(cl_int)size[2], 0 };
|
|
if (swapLayer) {
|
|
fillOrigin[2] = fillOrigin[1];
|
|
fillOrigin[1] = 0;
|
|
fillSize[2] = fillSize[1];
|
|
fillSize[1] = 1;
|
|
}
|
|
setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
|
|
setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);
|
|
|
|
// Find the type of image
|
|
uint32_t type = 0;
|
|
switch (newFormat.image_channel_data_type) {
|
|
case CL_SNORM_INT8:
|
|
case CL_SNORM_INT16:
|
|
case CL_UNORM_INT8:
|
|
case CL_UNORM_INT16:
|
|
case CL_UNORM_SHORT_565:
|
|
case CL_UNORM_SHORT_555:
|
|
case CL_UNORM_INT_101010:
|
|
case CL_HALF_FLOAT:
|
|
case CL_FLOAT:
|
|
type = 0;
|
|
break;
|
|
case CL_SIGNED_INT8:
|
|
case CL_SIGNED_INT16:
|
|
case CL_SIGNED_INT32:
|
|
type = 1;
|
|
break;
|
|
case CL_UNSIGNED_INT8:
|
|
case CL_UNSIGNED_INT16:
|
|
case CL_UNSIGNED_INT32:
|
|
type = 2;
|
|
break;
|
|
}
|
|
setArgument(kernels_[fillType], 6, sizeof(type), &type);
|
|
|
|
// Create ND range object for the kernel's execution
|
|
amd::NDRangeContainer ndrange(dim,
|
|
globalWorkOffset, globalWorkSize, localWorkSize);
|
|
|
|
// Execute the blit
|
|
address parameters = kernels_[fillType]->parameters().values();
|
|
result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
|
|
if (releaseView) {
|
|
delete memView;
|
|
}
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
KernelBlitManager::runScheduler(
|
|
device::Memory& vqueue,
|
|
device::Memory& params,
|
|
uint paramIdx,
|
|
uint numSlots
|
|
) const
|
|
{
|
|
amd::ScopedLock k(lockXferOps_);
|
|
bool result = false;
|
|
|
|
size_t dim = 1;
|
|
size_t globalWorkOffset[1] = { 0 };
|
|
size_t globalWorkSize[1] = { numSlots / 32 };
|
|
size_t localWorkSize[1] = { 1 };
|
|
|
|
// Program kernels arguments
|
|
Memory* q = &gpuMem(vqueue);
|
|
Memory* p = &gpuMem(params);
|
|
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &q);
|
|
setArgument(kernels_[Scheduler], 1, sizeof(cl_mem), &p);
|
|
setArgument(kernels_[Scheduler], 2, sizeof(uint), ¶mIdx);
|
|
|
|
// Create ND range object for the kernel's execution
|
|
amd::NDRangeContainer ndrange(1,
|
|
globalWorkOffset, globalWorkSize, localWorkSize);
|
|
|
|
// Execute the blit
|
|
address parameters = kernels_[Scheduler]->parameters().values();
|
|
result = gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], parameters);
|
|
|
|
synchronize();
|
|
|
|
return result;
|
|
}
|
|
|
|
// Pins a range of host memory and wraps it into an amd::Buffer.
//
// The host pointer is aligned down to PinnedMemoryAlignment and the pinned
// size is rounded up to cover [hostMem, hostMem + pinSize).
//
// hostMem  - start of the host range to pin
// pinSize  - size of the range in bytes
// partial  - [out] distance in bytes from the aligned base to hostMem;
//            it is written even if pinning fails
//
// Returns the pinned memory object, or NULL if the object can't be
// allocated or created, or if no device memory can be obtained for it even
// after all previously pinned memory was released. The caller owns the
// returned object.
amd::Memory*
KernelBlitManager::pinHostMemory(
    const void* hostMem,
    size_t      pinSize,
    size_t&     partial) const
{
    size_t  pinAllocSize;
    const static bool SysMem = true;
    amd::Memory* amdMemory;

    // Align offset to 4K boundary (Vista/Win7 limitation)
    char* tmpHost = const_cast<char*>(
        amd::alignDown(reinterpret_cast<const char*>(hostMem),
        PinnedMemoryAlignment));

    // Find the partial size for unaligned copy
    partial = reinterpret_cast<const char*>(hostMem) - tmpHost;

    // Recalculate pin memory size
    pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);

    amdMemory = new(*context_)
        amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize);

    // A failed allocation must not be dereferenced below
    if (amdMemory == NULL) {
        return NULL;
    }

    if (!amdMemory->create(tmpHost, SysMem)) {
        amdMemory->release();
        return NULL;
    }

    // Get device memory for this virtual device
    // @note: This will force real memory pinning
    amdMemory->setVirtualDevice(&gpu());
    Memory* srcMemory = dev().getGpuMemory(amdMemory);

    if (srcMemory == NULL) {
        // Release all pinned memory and attempt pinning again
        gpu().releasePinnedMem();
        srcMemory = dev().getGpuMemory(amdMemory);
        if (srcMemory == NULL) {
            // Release memory
            amdMemory->release();
            amdMemory = NULL;
        }
    }

    return amdMemory;
}
|
|
|
|
// Creates an image view of an existing image with a different format.
//
// The view takes the parent's size, width, height, depth and image type,
// and is created on level 0, layer 0 of the parent resource.
//
// parent - image to create the view for; buffers aren't supported
// format - CAL format (type and channel order) of the view
//
// Returns the new view, or NULL if it can't be allocated or created.
// The caller owns the returned object.
Memory*
KernelBlitManager::createView(
    const Memory&       parent,
    const CalFormat&    format
    ) const
{
    assert(!parent.cal()->buffer_ && "View supports images only");
    gpu::Memory* gpuImage = NULL;

    gpuImage = new gpu::Image(dev(), parent.size(),
        parent.cal()->width_,
        parent.cal()->height_,
        parent.cal()->depth_,
        format.type_,
        format.channelOrder_,
        parent.cal()->imageType_);

    // Create resource
    if (NULL != gpuImage) {
        bool    result = false;
        Resource::ImageViewParams   params;

        params.owner_       = parent.owner();
        params.level_       = 0;
        params.layer_       = 0;
        // The parent serves as both the viewed resource and the memory
        params.resource_    = &parent;
        params.memory_      = &parent;
        params.gpu_         = &gpu();

        // Create memory object
        result = gpuImage->create(Resource::ImageView, &params);
        if (!result) {
            delete gpuImage;
            return NULL;
        }
    }

    return gpuImage;
}
|
|
|
|
} // namespace gpu
|