Files
rocm-systems/rocclr/runtime/device/gpu/gpublit.cpp
T
foreman d2b905f18e P4 to Git Change 1057998 by gandryey@gera-dev-w7 on 2014/07/22 17:15:58
ECR #304775 - Device enqueuing
	- Use atomic fetch for enqueue flags
	- Switch to a multithreaded scheduler
	- Add a workaround for Linux host_multi_queue failures. Linux has only 2 queues, but the test allocates multiple host queues and the same HW ring can be used

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#106 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#449 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#325 edit
2014-07-22 17:30:56 -04:00

2848 lines
95 KiB
C++

//
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
//
#include "platform/commandqueue.hpp"
#include "device/gpu/gpudevice.hpp"
#include "device/gpu/gpublit.hpp"
#include "device/gpu/gpumemory.hpp"
#include "device/gpu/gpuvirtual.hpp"
#include "utils/debug.hpp"
namespace gpu {
//! Builds a DMA blit manager on top of the host blit manager.
//! The minimum size for pinned transfers is taken from the device settings
//! and completeOperation_ starts out as false.
DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
    : HostBlitManager(gpu, setup)
    , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_)
    , completeOperation_(false)
{
    // Nothing else to initialize
}
//! Waits for all engines and releases the memory objects on the virtual GPU,
//! but only when syncOperation_ is set; otherwise does nothing.
inline void
DmaBlitManager::synchronize() const
{
    if (!syncOperation_) {
        return;
    }

    gpu().waitAllEngines();
    gpu().releaseMemObjects();
}
//! Downcasts an abstract device memory reference to the GPU memory type.
//! The cast is unchecked (static_cast).
inline Memory&
DmaBlitManager::gpuMem(device::Memory& mem) const
{
    Memory& gpuMemory = static_cast<Memory&>(mem);
    return gpuMemory;
}
//! Reads \a xferSize bytes from \a srcMemory into host memory through two
//! staging buffers used alternately: while the copy of chunk N into one
//! staging buffer is issued, chunk N-1 is read back to the host from the other.
//!
//! \param srcMemory  Source resource
//! \param dstHost    Base host destination pointer (indexed with \a offset)
//! \param xferBuf    Array of two staging resources
//! \param origin     Base offset inside the source resource
//! \param offset     [in/out] running offset, advanced by every issued chunk
//! \param totalSize  [in/out] remaining size, reduced by every issued chunk
//! \param xferSize   Number of bytes to transfer in this call
//! \return false as soon as any copy or host read fails, true otherwise
bool
DmaBlitManager::readMemoryStaged(
    Resource& srcMemory,
    void* dstHost,
    Resource** xferBuf,
    size_t origin,
    size_t& offset,
    size_t& totalSize,
    size_t xferSize) const
{
    static const bool CopyRect = false;
    // Flush DMA for ASYNC copy
    static const bool FlushDMA = true;

    amd::Coord3D stagingOrigin(0, 0, 0);
    char* const hostBytes = reinterpret_cast<char*>(dstHost);
    uint writeSlot = 0;
    uint readSlot = 0;

    // Small staging buffers are used whole; otherwise use roughly a quarter
    // of the transfer, clamped to [128KB, staging buffer size]
    size_t chunkSize = dev().xferRead().bufSize();
    if (!(dev().xferRead().bufSize() < 128 * Ki)) {
        chunkSize = std::min(amd::alignUp(xferSize / 4, 256),
            dev().xferRead().bufSize());
        chunkSize = std::max(chunkSize, 128 * Ki);
    }

    // Issue the copy of the very first chunk into the staging buffer
    size_t pendingBytes = std::min(chunkSize, xferSize);
    amd::Coord3D firstSrc(origin + offset, 0, 0);
    amd::Coord3D pendingSize(pendingBytes, 0, 0);
    if (!srcMemory.partialMemCopyTo(gpu(), firstSrc, stagingOrigin, pendingSize,
            *xferBuf[writeSlot], CopyRect, FlushDMA)) {
        return false;
    }
    totalSize -= pendingBytes;
    xferSize -= pendingBytes;
    offset += pendingBytes;

    while (xferSize != 0) {
        // Size of the chunk issued in this iteration
        const size_t curBytes = std::min(chunkSize, xferSize);
        amd::Coord3D curSrc(origin + offset, 0, 0);
        amd::Coord3D curSize(curBytes, 0, 0);

        // Switch to the other staging buffer and issue the next copy
        writeSlot = (writeSlot + 1) % 2;
        if (!srcMemory.partialMemCopyTo(gpu(), curSrc, stagingOrigin, curSize,
                *xferBuf[writeSlot], CopyRect, FlushDMA)) {
            return false;
        }

        // Drain the previously filled buffer; 'offset' still points at the
        // start of the current chunk, so step back by the pending size
        if (!xferBuf[readSlot]->hostRead(&gpu(),
                hostBytes + offset - pendingSize[0],
                stagingOrigin, pendingSize)) {
            return false;
        }
        readSlot = (readSlot + 1) % 2;

        pendingSize = curSize;
        totalSize -= curBytes;
        xferSize -= curBytes;
        offset += curBytes;
    }

    // Drain the last staging buffer
    if (!xferBuf[readSlot]->hostRead(&gpu(),
            hostBytes + offset - pendingSize[0], stagingOrigin, pendingSize)) {
        return false;
    }
    return true;
}
//! Reads a linear range of a device buffer into host memory.
//!
//! Path selection:
//!  - host copy when the DMA read is disabled in the setup or the source is
//!    cacheable host memory with direct access;
//!  - otherwise a pinned transfer of the host destination is attempted for
//!    sizes above MinSizeForPinnedTransfer;
//!  - whatever remains (everything, or the tail after a pinning failure)
//!    goes through the staged double-buffered read.
//!
//! \param srcMemory  Source device memory
//! \param dstHost    Destination host pointer
//! \param origin     Source origin (only component 0 is used)
//! \param size       Transfer size (only component 0 is used)
//! \param entire     Forwarded to the host path only
//! \return false if the staged read fails, true otherwise
bool
DmaBlitManager::readBuffer(
    device::Memory& srcMemory,
    void* dstHost,
    const amd::Coord3D& origin,
    const amd::Coord3D& size,
    bool entire) const
{
    // Use host copy if memory has direct access
    if (setup_.disableReadBuffer_ ||
        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
        return HostBlitManager::readBuffer(
            srcMemory, dstHost, origin, size, entire);
    }
    else {
        size_t srcSize = size[0];
        size_t offset = 0;
        size_t pinSize = dev().settings().pinnedXferSize_;
        pinSize = std::min(pinSize, srcSize);

        // Check if a pinned transfer can be executed
        if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
            // Align offset to 4K boundary (Vista/Win7 limitation)
            char* tmpHost = const_cast<char*>(
                amd::alignDown(reinterpret_cast<const char*>(dstHost),
                PinnedMemoryAlignment));

            // Find the partial size for unaligned copy
            size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;

            Resource* pin[MaxPinnedBuffers];
            memset(pin, 0, sizeof(Resource*) * MaxPinnedBuffers);
            uint pinIdx = 0;
            bool first = true;
            size_t tmpSize;
            size_t pinAllocSize;

            // Copy memory, using pinning
            while (srcSize > 0) {
                // If it's the first iteration, then readjust the copy size
                // to include alignment
                if (first) {
                    pinAllocSize = amd::alignUp(pinSize + partial,
                        PinnedMemoryAlignment);
                    tmpSize = std::min(pinAllocSize - partial, srcSize);
                    first = false;
                }
                else {
                    tmpSize = std::min(pinSize, srcSize);
                    pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
                    partial = 0;
                }
                amd::Coord3D dst(partial, 0, 0);
                amd::Coord3D srcPin(origin[0] + offset, 0, 0);
                amd::HostMemoryReference hostMem(tmpHost);
                amd::Coord3D copySizePin(tmpSize, 0, 0);

                // Allocate a GPU resource for pinning
                pin[pinIdx] = new Resource(
                    dev(), pinAllocSize / Heap::ElementSize, Heap::ElementType);
                if (pin[pinIdx] != NULL) {
                    Resource::PinnedParams params;
                    params.owner_ = NULL;
                    params.gpu_ = &gpu();
                    params.hostMemRef_ = &hostMem;
                    params.size_ = pinAllocSize;

                    // Create memory object
                    if (pin[pinIdx]->create(Resource::Pinned, &params)) {
                        if (!gpuMem(srcMemory).partialMemCopyTo(
                                gpu(), srcPin, dst, copySizePin, *pin[pinIdx])) {
                            LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
                            break;
                        }
                    }
                    else {
                        LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
                        break;
                    }
                    // Advance to the next slot and free whatever it held
                    pinIdx = (pinIdx + 1) % MaxPinnedBuffers;
                    delete pin[pinIdx];
                    pin[pinIdx] = NULL;
                }
                else {
                    LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
                    break;
                }
                srcSize -= tmpSize;
                offset += tmpSize;
                tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
            }

            // Free all pinned resources that are still alive
            for (uint idx = 0; idx < MaxPinnedBuffers; ++idx) {
                delete pin[idx];
            }
        }

        // Staged path for the data the pinned path did not transfer
        if (0 != srcSize) {
            Resource& xferBuf0 = dev().xferRead().acquire();
            Resource& xferBuf1 = dev().xferRead().acquire();
            Resource* xferBuf[2] = { &xferBuf0, &xferBuf1 };

            // Read memory using a staged resource
            const bool staged = readMemoryStaged(gpuMem(srcMemory), dstHost,
                xferBuf, origin[0], offset, srcSize, srcSize);

            // Hand the staging buffers back in every case; returning early on
            // a failure would leave both buffers acquired forever
            dev().xferRead().release(gpu(), xferBuf1);
            dev().xferRead().release(gpu(), xferBuf0);

            if (!staged) {
                LogError("DmaBlitManager::readBuffer failed!");
                return false;
            }
        }
    }
    return true;
}
//! Reads a rectangular region of a device buffer into host memory.
//!
//! The host path is used when the DMA rect read is disabled in the setup or
//! the source is cacheable host memory with direct access. Otherwise every
//! row (y) of every slice (z) is copied through one staging buffer, in pieces
//! no larger than the staging buffer size.
//!
//! \param srcMemory  Source device memory
//! \param dstHost    Destination host pointer
//! \param bufRect    Rectangle description inside the device buffer
//! \param hostRect   Rectangle description inside the host memory
//! \param size       Region size: bytes per row, rows, slices
//! \param entire     Forwarded to the host path only
//! \return false if any copy or host read fails, true otherwise
bool
DmaBlitManager::readBufferRect(
    device::Memory& srcMemory,
    void* dstHost,
    const amd::BufferRect& bufRect,
    const amd::BufferRect& hostRect,
    const amd::Coord3D& size,
    bool entire) const
{
    // Use host copy if memory has direct access
    if (setup_.disableReadBufferRect_ ||
        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
        return HostBlitManager::readBufferRect(
            srcMemory, dstHost, bufRect, hostRect, size, entire);
    }

    Resource& xferBuf = dev().xferRead().acquire();
    amd::Coord3D dst(0, 0, 0);
    bool result = true;

    for (size_t z = 0; result && (z < size[2]); ++z) {
        for (size_t y = 0; result && (y < size[1]); ++y) {
            size_t srcSize = size[0];
            size_t bufOffset = bufRect.offset(0, y, z);
            size_t hostOffset = hostRect.offset(0, y, z);

            while (srcSize != 0) {
                // Find the partial transfer size
                const size_t tmpSize = std::min(dev().xferRead().bufSize(), srcSize);
                amd::Coord3D src(bufOffset, 0, 0);
                amd::Coord3D copySize(tmpSize, 0, 0);

                // Copy data into the temporary surface and then read it back
                // on the host; stop at the first failure
                if (!gpuMem(srcMemory).partialMemCopyTo(
                        gpu(), src, dst, copySize, xferBuf, true) ||
                    !xferBuf.hostRead(&gpu(),
                        reinterpret_cast<char*>(dstHost) + hostOffset,
                        dst, copySize)) {
                    LogError("DmaBlitManager::readBufferRect failed!");
                    result = false;
                    break;
                }
                srcSize -= tmpSize;
                bufOffset += tmpSize;
                hostOffset += tmpSize;
            }
        }
    }

    // Always hand the staging buffer back, including on failure; the
    // previous early returns left it acquired forever
    dev().xferRead().release(gpu(), xferBuf);
    return result;
}
//! Reads an image region into host memory.
//! No DMA image read is implemented, so the call is always forwarded
//! to the host blit manager.
bool
DmaBlitManager::readImage(
    device::Memory& srcMemory,
    void* dstHost,
    const amd::Coord3D& origin,
    const amd::Coord3D& size,
    size_t rowPitch,
    size_t slicePitch,
    bool entire) const
{
    // DMA path explicitly disabled in the setup
    if (setup_.disableReadImage_) {
        return HostBlitManager::readImage(srcMemory, dstHost,
            origin, size, rowPitch, slicePitch, entire);
    }

    //! @todo Add HW accelerated path
    return HostBlitManager::readImage(srcMemory, dstHost,
        origin, size, rowPitch, slicePitch, entire);
}
//! Writes \a xferSize bytes of host memory into \a dstMemory through a
//! staging buffer: every chunk is first written into \a xferBuf by the CPU
//! and then copied from there into the destination.
//!
//! \param srcHost    Base host source pointer (indexed with \a offset)
//! \param dstMemory  Destination resource
//! \param xferBuf    Staging resource
//! \param origin     Base offset inside the destination resource
//! \param offset     [in/out] running offset, advanced by every chunk
//! \param totalSize  [in/out] remaining size, reduced by every chunk
//! \param xferSize   Number of bytes to transfer in this call
//! \return false as soon as a host write or copy fails, true otherwise
bool
DmaBlitManager::writeMemoryStaged(
    const void* srcHost,
    Resource& dstMemory,
    Resource& xferBuf,
    size_t origin,
    size_t& offset,
    size_t& totalSize,
    size_t xferSize) const
{
    amd::Coord3D src(0, 0, 0);
    size_t tmpSize;
    size_t chunkSize;

    // The chunk must fit the staging buffer of the *write* pool (the callers
    // acquire xferBuf from xferWrite()), so size it from that pool rather
    // than from the read pool.
    const size_t stagingSize = dev().xferWrite().bufSize();
    if (stagingSize < 128 * Ki) {
        chunkSize = stagingSize;
    }
    else {
        // Roughly a quarter of the transfer, clamped to [128KB, stagingSize]
        chunkSize = std::min(amd::alignUp(xferSize / 4, 256), stagingSize);
        chunkSize = std::max(chunkSize, 128 * Ki);
    }

    while (xferSize != 0) {
        // Find the partial transfer size
        tmpSize = std::min(chunkSize, xferSize);
        amd::Coord3D dst(origin + offset, 0, 0);
        amd::Coord3D copySize(tmpSize, 0, 0);

        // Copy data into the temporary buffer, using CPU
        if (!xferBuf.hostWrite(&gpu(),
                reinterpret_cast<const char*>(srcHost) + offset,
                src, copySize, Resource::Discard)) {
            return false;
        }

        // Copy data into the original destination memory
        if (!xferBuf.partialMemCopyTo(
                gpu(), src, dst, copySize, dstMemory)) {
            return false;
        }
        totalSize -= tmpSize;
        offset += tmpSize;
        xferSize -= tmpSize;
    }
    return true;
}
bool
DmaBlitManager::writeBuffer(
const void* srcHost,
device::Memory& dstMemory,
const amd::Coord3D& origin,
const amd::Coord3D& size,
bool entire) const
{
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBuffer_ ||
gpuMem(dstMemory).isHostMemDirectAccess() ||
gpuMem(dstMemory).isPersistentDirectMap()) {
return HostBlitManager::writeBuffer(
srcHost, dstMemory, origin, size, entire);
}
else {
size_t dstSize = size[0];
size_t tmpSize = 0;
size_t offset = 0;
size_t pinSize = dev().settings().pinnedXferSize_;
pinSize = std::min(pinSize, dstSize);
// Check if a pinned transfer can be executed
if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
// Allign offset to 4K boundary (Vista/Win7 limitation)
char* tmpHost = const_cast<char*>(
amd::alignDown(reinterpret_cast<const char*>(srcHost),
PinnedMemoryAlignment));
// Find the partial size for unaligned copy
size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
Resource* pin[MaxPinnedBuffers];
memset(pin, 0, sizeof(Resource*) * MaxPinnedBuffers);
uint pinIdx = 0;
bool first = true;
size_t tmpSize;
size_t pinAllocSize;
// Copy memory, using pinning
while (dstSize > 0) {
// If it's the first iterarion, then readjust the copy size
// to include alignment
if (first) {
pinAllocSize = amd::alignUp(pinSize + partial,
PinnedMemoryAlignment);
tmpSize = std::min(pinAllocSize - partial, dstSize);
first = false;
}
else {
tmpSize = std::min(pinSize, dstSize);
pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
partial = 0;
}
amd::Coord3D src(partial, 0, 0);
amd::Coord3D dstPin(origin[0] + offset, 0, 0);
amd::HostMemoryReference hostMem(tmpHost);
amd::Coord3D copySizePin(tmpSize, 0, 0);
// Allocate a GPU resource for pinning
pin[pinIdx] = new Resource(
dev(), pinAllocSize / Heap::ElementSize, Heap::ElementType);
if (pin[pinIdx] != NULL) {
Resource::PinnedParams params;
params.owner_ = NULL;
params.gpu_ = &gpu();
params.hostMemRef_ = &hostMem;
params.size_ = pinAllocSize;
// Create memory object
if (pin[pinIdx]->create(Resource::Pinned, &params)) {
if (!pin[pinIdx]->partialMemCopyTo(
gpu(), src, dstPin, copySizePin, gpuMem(dstMemory))) {
LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
break;
}
}
else {
LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
break;
}
pinIdx = (pinIdx + 1) % MaxPinnedBuffers;
delete pin[pinIdx];
pin[pinIdx] = NULL;
}
else {
LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
break;
}
dstSize -= tmpSize;
offset += tmpSize;
tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
}
for (uint idx = 0; idx < MaxPinnedBuffers; ++idx) {
delete pin[idx];
}
}
if (dstSize != 0) {
Resource& xferBuf = dev().xferWrite().acquire();
// Write memory using a staged resource
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0],
offset, dstSize, dstSize)) {
LogError("DmaBlitManager::writeBuffer failed!");
return false;
}
gpu().addXferWrite(xferBuf);
}
}
return true;
}
//! Writes host memory into a rectangular region of a device buffer.
//!
//! The host path is used when the DMA rect write is disabled in the setup,
//! or the destination has direct host access or is a persistent direct map.
//! Otherwise every row (y) of every slice (z) is written through one staging
//! buffer, in pieces no larger than the staging buffer size.
//!
//! \param srcHost    Source host pointer
//! \param dstMemory  Destination device memory
//! \param hostRect   Rectangle description inside the host memory
//! \param bufRect    Rectangle description inside the device buffer
//! \param size       Region size: bytes per row, rows, slices
//! \param entire     Forwarded to the host path only
//! \return false if any host write or copy fails, true otherwise
bool
DmaBlitManager::writeBufferRect(
    const void* srcHost,
    device::Memory& dstMemory,
    const amd::BufferRect& hostRect,
    const amd::BufferRect& bufRect,
    const amd::Coord3D& size,
    bool entire) const
{
    // Use host copy if memory has direct access or it's persistent
    if (setup_.disableWriteBufferRect_ ||
        dstMemory.isHostMemDirectAccess() ||
        gpuMem(dstMemory).isPersistentDirectMap()) {
        return HostBlitManager::writeBufferRect(
            srcHost, dstMemory, hostRect, bufRect, size, entire);
    }

    Resource& xferBuf = dev().xferWrite().acquire();
    amd::Coord3D src(0, 0, 0);
    bool result = true;

    for (size_t z = 0; result && (z < size[2]); ++z) {
        for (size_t y = 0; result && (y < size[1]); ++y) {
            size_t dstSize = size[0];
            size_t bufOffset = bufRect.offset(0, y, z);
            size_t hostOffset = hostRect.offset(0, y, z);

            while (dstSize != 0) {
                // Find the partial transfer size
                const size_t tmpSize = std::min(dev().xferWrite().bufSize(), dstSize);
                amd::Coord3D dst(bufOffset, 0, 0);
                amd::Coord3D copySize(tmpSize, 0, 0);

                // Fill the temporary buffer with the CPU and then copy it
                // into the destination memory; stop at the first failure
                if (!xferBuf.hostWrite(&gpu(),
                        reinterpret_cast<const char*>(srcHost) + hostOffset,
                        src, copySize, Resource::Discard) ||
                    !xferBuf.partialMemCopyTo(
                        gpu(), src, dst, copySize, gpuMem(dstMemory))) {
                    LogError("DmaBlitManager::writeBufferRect failed!");
                    result = false;
                    break;
                }
                dstSize -= tmpSize;
                bufOffset += tmpSize;
                hostOffset += tmpSize;
            }
        }
    }

    // Always queue the staging buffer on the virtual GPU, including on
    // failure; the previous early returns left it acquired forever
    gpu().addXferWrite(xferBuf);
    return result;
}
//! Writes host memory into an image region.
//! No DMA image write is implemented, so the call is always forwarded
//! to the host blit manager.
bool
DmaBlitManager::writeImage(
    const void* srcHost,
    device::Memory& dstMemory,
    const amd::Coord3D& origin,
    const amd::Coord3D& size,
    size_t rowPitch,
    size_t slicePitch,
    bool entire) const
{
    // DMA path explicitly disabled in the setup
    if (setup_.disableWriteImage_) {
        return HostBlitManager::writeImage(
            srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
    }

    //! @todo Add HW accelerated path
    return HostBlitManager::writeImage(
        srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
}
//! Copies a linear range between two buffers.
//! The host path is taken when the DMA copy is disabled in the setup, or when
//! the source is cacheable direct-access host memory and the destination has
//! direct host access as well. Note that \a entire is not forwarded to either
//! path.
bool
DmaBlitManager::copyBuffer(
    device::Memory& srcMemory,
    device::Memory& dstMemory,
    const amd::Coord3D& srcOrigin,
    const amd::Coord3D& dstOrigin,
    const amd::Coord3D& size,
    bool entire) const
{
    const bool useHostCopy = setup_.disableCopyBuffer_ ||
        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
         gpuMem(dstMemory).isHostMemDirectAccess());

    if (!useHostCopy) {
        // Copy with the device memory-to-memory path
        return gpuMem(srcMemory).partialMemCopyTo(gpu(),
            srcOrigin, dstOrigin, size, gpuMem(dstMemory));
    }

    return HostBlitManager::copyBuffer(
        srcMemory, dstMemory, srcOrigin, dstOrigin, size);
}
//! Copies a rectangular region between two buffers.
//!
//! The host path is taken when the DMA rect copy is disabled in the setup, or
//! when the source is cacheable direct-access host memory and the destination
//! has direct host access as well. Without rectLinearDMA_ the region is copied
//! row by row; with it a single rect copy is issued, provided the offsets,
//! pitches and sizes pass the alignment and range checks below.
//!
//! \return false if a copy fails or the rect copy restrictions are not met
bool
DmaBlitManager::copyBufferRect(
    device::Memory& srcMemory,
    device::Memory& dstMemory,
    const amd::BufferRect& srcRect,
    const amd::BufferRect& dstRect,
    const amd::Coord3D& size,
    bool entire) const
{
    if (setup_.disableCopyBufferRect_ ||
        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
         gpuMem(dstMemory).isHostMemDirectAccess())) {
        return HostBlitManager::copyBufferRect(
            srcMemory, dstMemory, srcRect, dstRect, size, entire);
    }

    if (!dev().settings().rectLinearDMA_) {
        // No rect copy available: issue one linear copy per row
        for (size_t slice = 0; slice < size[2]; ++slice) {
            for (size_t row = 0; row < size[1]; ++row) {
                const size_t srcRowOffset = srcRect.offset(0, row, slice);
                const size_t dstRowOffset = dstRect.offset(0, row, slice);

                amd::Coord3D src(srcRowOffset, 0, 0);
                amd::Coord3D dst(dstRowOffset, 0, 0);
                amd::Coord3D copySize(size[0], 0, 0);

                // Copy data
                if (!gpuMem(srcMemory).partialMemCopyTo(
                        gpu(), src, dst, copySize, gpuMem(dstMemory))) {
                    LogError("copyBufferRect failed!");
                    return false;
                }
            }
        }
        return true;
    }

    const size_t srcStart = srcRect.offset(0, 0, 0);
    const size_t dstStart = dstRect.offset(0, 0, 0);

    // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
    size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;

    // Offsets, row width and pitches all have to be multiples of 4
    const bool unaligned =
        ((srcStart % 4) != 0) ||
        ((dstStart % 4) != 0) ||
        ((size[0] % 4) != 0) ||
        ((srcRect.rowPitch_ % 4) != 0) ||
        ((srcRect.slicePitch_ % 4) != 0) ||
        ((dstRect.rowPitch_ % 4) != 0) ||
        ((dstRect.slicePitch_ % 4) != 0);

    const bool pitchOutOfRange =
        (srcRect.rowPitch_ > pitchLimit) ||
        (dstRect.rowPitch_ > pitchLimit);

    const bool sizeOutOfRange =
        (size[0] > 0x3fff) ||   // 14 bits limit in HW
        (size[1] > 0x3fff) ||   // 14 bits limit in HW
        (size[2] > 0x7ff);      // 11 bits limit in HW

    if (unaligned || pitchOutOfRange || sizeOutOfRange) {
        // Restriction with rectLinearDRMDMA packet
        return false;
    }

    // Copy data
    if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
            amd::Coord3D(srcStart, srcRect.rowPitch_, srcRect.slicePitch_),
            amd::Coord3D(dstStart, dstRect.rowPitch_, dstRect.slicePitch_),
            size, gpuMem(dstMemory), true)) {
        LogError("copyBufferRect failed!");
        return false;
    }
    return true;
}
//! Copies an image region into a buffer.
//! Uses the host path when the DMA copy is disabled in the setup. Otherwise
//! the device copy is attempted first and the host path is used as a fallback
//! only when completeOperation_ is set.
//!
//! \return Result of the last path that was attempted
bool
DmaBlitManager::copyImageToBuffer(
    device::Memory& srcMemory,
    device::Memory& dstMemory,
    const amd::Coord3D& srcOrigin,
    const amd::Coord3D& dstOrigin,
    const amd::Coord3D& size,
    bool entire,
    size_t rowPitch,
    size_t slicePitch) const
{
    if (setup_.disableCopyImageToBuffer_) {
        return HostBlitManager::copyImageToBuffer(srcMemory, dstMemory,
            srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
    }

    // Use CAL path for a transfer
    const bool copied = gpuMem(srcMemory).partialMemCopyTo(
        gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
    if (copied) {
        return true;
    }

    // Check if a HostBlit transfer is required
    if (!completeOperation_) {
        return false;
    }
    return HostBlitManager::copyImageToBuffer(srcMemory,
        dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
}
//! Copies a buffer range into an image region.
//! Uses the host path when the DMA copy is disabled in the setup. Otherwise
//! the device copy is attempted first and the host path is used as a fallback
//! only when completeOperation_ is set.
//!
//! \return Result of the last path that was attempted
bool
DmaBlitManager::copyBufferToImage(
    device::Memory& srcMemory,
    device::Memory& dstMemory,
    const amd::Coord3D& srcOrigin,
    const amd::Coord3D& dstOrigin,
    const amd::Coord3D& size,
    bool entire,
    size_t rowPitch,
    size_t slicePitch) const
{
    if (setup_.disableCopyBufferToImage_) {
        return HostBlitManager::copyBufferToImage(srcMemory,
            dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
    }

    // Use CAL path for a transfer
    const bool copied = gpuMem(srcMemory).partialMemCopyTo(
        gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
    if (copied) {
        return true;
    }

    // Check if a HostBlit transfer is required
    if (!completeOperation_) {
        return false;
    }
    return HostBlitManager::copyBufferToImage(srcMemory,
        dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
}
//! Copies a region between two images.
//! No DMA image copy is implemented, so the call is always forwarded
//! to the host blit manager.
bool
DmaBlitManager::copyImage(
    device::Memory& srcMemory,
    device::Memory& dstMemory,
    const amd::Coord3D& srcOrigin,
    const amd::Coord3D& dstOrigin,
    const amd::Coord3D& size,
    bool entire) const
{
    // DMA path explicitly disabled in the setup
    if (setup_.disableCopyImage_) {
        return HostBlitManager::copyImage(srcMemory, dstMemory,
            srcOrigin, dstOrigin, size, entire);
    }

    //! @todo Add HW accelerated path
    return HostBlitManager::copyImage(srcMemory, dstMemory,
        srcOrigin, dstOrigin, size, entire);
}
//! Builds a kernel blit manager in an empty state: no program, context,
//! constant buffer or lock, and all kernel and transfer buffer slots cleared.
//! The actual objects are created later in create().
KernelBlitManager::KernelBlitManager(
    VirtualGPU& gpu, Setup setup)
    : DmaBlitManager(gpu, setup)
    , program_(NULL)
    , context_(NULL)
    , constantBuffer_(NULL)
    , xferBufferSize_(0)
    , lockXferOps_(NULL)
{
    // Clear every blit kernel slot
    for (uint kernelIdx = 0; kernelIdx < BlitTotal; ++kernelIdx) {
        kernels_[kernelIdx] = NULL;
    }

    // Clear every internal transfer buffer slot
    for (uint bufIdx = 0; bufIdx < MaxXferBuffers; ++bufIdx) {
        xferBuffers_[bufIdx] = NULL;
    }

    completeOperation_ = false;
}
//! Releases every object the manager holds a reference on (kernels, program,
//! context, constant buffer, transfer buffers) and deletes the transfer lock.
KernelBlitManager::~KernelBlitManager()
{
    for (uint kernelIdx = 0; kernelIdx < BlitTotal; ++kernelIdx) {
        if (kernels_[kernelIdx] == NULL) {
            continue;
        }
        kernels_[kernelIdx]->release();
    }

    if (program_ != NULL) {
        program_->release();
    }

    if (context_ != NULL) {
        // Release a dummy context
        context_->release();
    }

    if (constantBuffer_ != NULL) {
        constantBuffer_->release();
    }

    for (uint bufIdx = 0; bufIdx < MaxXferBuffers; ++bufIdx) {
        if (xferBuffers_[bufIdx] == NULL) {
            continue;
        }
        xferBuffers_[bufIdx]->release();
    }

    delete lockXferOps_;
}
//! Creates the blit manager objects for the given device.
//! \return Result of createProgram() for the device downcast to gpu::Device
bool
KernelBlitManager::create(amd::Device& device)
{
    return createProgram(static_cast<Device&>(device));
}
//! Creates everything the kernel blits need for this virtual GPU:
//! retains the device blit context and program, creates the blit kernels,
//! the internal constant buffer, the optional transfer buffers and the
//! transfer lock.
//!
//! \param device  GPU device that owns the blit program
//! \return false if the constant buffer, a transfer buffer or the lock
//!         could not be created, true otherwise
bool
KernelBlitManager::createProgram(Device& device)
{
    // Save context and program for this device
    context_ = device.blitProgram()->context_;
    context_->retain();
    program_ = device.blitProgram()->program_;
    program_->retain();

    // Create kernel objects for all blits.
    // NOTE(review): a missing symbol, a failed kernel allocation or a failed
    // validation only stops the creation of the remaining kernels; it has
    // never been reported to the caller, and that behaviour is kept here.
    // Confirm whether a hard failure is intended before changing it.
    for (uint i = 0; i < BlitTotal; ++i) {
        const amd::Symbol* symbol = program_->findSymbol(BlitName[i]);
        if (symbol == NULL) {
            break;
        }
        kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]);
        if (kernels_[i] == NULL) {
            break;
        }
        // Validate blit kernels for the scratch memory usage (pre SI)
        if (!device.validateKernel(*kernels_[i], &gpu())) {
            break;
        }
    }

    // Create an internal constant buffer
    constantBuffer_ = new (*context_)
        amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
    if (constantBuffer_ == NULL) {
        return false;
    }
    if (!constantBuffer_->create(NULL)) {
        constantBuffer_->release();
        constantBuffer_ = NULL;
        return false;
    }

    // Assign the constant buffer to the current virtual GPU
    constantBuffer_->setVirtualDevice(&gpu());

    if (dev().settings().xferBufSize_ > 0) {
        xferBufferSize_ = dev().settings().xferBufSize_;
        for (uint i = 0; i < MaxXferBuffers; ++i) {
            // Create internal xfer buffers for image copy optimization
            xferBuffers_[i] = new (*context_)
                amd::Buffer(*context_, 0, xferBufferSize_);
            if (xferBuffers_[i] == NULL) {
                return false;
            }
            if (!xferBuffers_[i]->create(NULL)) {
                xferBuffers_[i]->release();
                xferBuffers_[i] = NULL;
                return false;
            }

            // Assign the xfer buffer to the current virtual GPU
            xferBuffers_[i]->setVirtualDevice(&gpu());

            //! @note Workaround for conformance allocation test.
            //! Force GPU mem alloc.
            //! Unaligned images require xfer optimization,
            //! but deferred memory allocation can cause
            //! virtual heap fragmentation for big allocations and
            //! then fail the following test with 32 bit ISA, because
            //! runtime runs out of 4GB space.
            dev().getGpuMemory(xferBuffers_[i]);
        }
    }

    lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true);
    if (NULL == lockXferOps_) {
        return false;
    }

    return true;
}
// Helper tables for the image view creation.
// Some image formats can't be used by the blit kernels directly and have to
// be remapped to a compatible format first.

//! A single remapping entry: an original CL value and its replacement
struct FormatConvertion {
    cl_uint clOldType_;     //!< Value that has to be replaced
    cl_uint clNewType_;     //!< Replacement value
};

//! Channel data types that are rejected and their replacements
static const FormatConvertion RejectedData[] =
{
    { CL_UNORM_INT8,        CL_UNSIGNED_INT8  },
    { CL_UNORM_INT16,       CL_UNSIGNED_INT16 },
    { CL_SNORM_INT8,        CL_UNSIGNED_INT8  },
    { CL_SNORM_INT16,       CL_UNSIGNED_INT16 },
    { CL_HALF_FLOAT,        CL_UNSIGNED_INT16 },
    { CL_FLOAT,             CL_UNSIGNED_INT32 },
    { CL_SIGNED_INT8,       CL_UNSIGNED_INT8  },
    { CL_SIGNED_INT16,      CL_UNSIGNED_INT16 },
    { CL_UNORM_INT_101010,  CL_UNSIGNED_INT8  },
    { CL_SIGNED_INT32,      CL_UNSIGNED_INT32 }
};

//! Channel orders that are rejected and their replacements
static const FormatConvertion RejectedOrder[] =
{
    { CL_A,         CL_R    },
    { CL_RA,        CL_RG   },
    { CL_LUMINANCE, CL_R    },
    { CL_INTENSITY, CL_R    },
    { CL_RGB,       CL_RGBA },
    { CL_BGRA,      CL_RGBA },
    { CL_ARGB,      CL_RGBA },
    { CL_sRGB,      CL_RGBA },
    { CL_sRGBx,     CL_RGBA },
    { CL_sRGBA,     CL_RGBA },
    { CL_sBGRA,     CL_RGBA }
};

//! Number of entries in RejectedData
const uint RejectedFormatDataTotal =
    sizeof(RejectedData) / sizeof(RejectedData[0]);

//! Number of entries in RejectedOrder
const uint RejectedFormatChannelTotal =
    sizeof(RejectedOrder) / sizeof(RejectedOrder[0]);
//! Copies a buffer range into an image region.
//!
//! The whole call runs under the lockXferOps_ lock. Paths, in order:
//!  1. If the copy is disabled in the setup, the DmaBlitManager
//!     implementation is used and its result is returned.
//!  2. If the source has direct host access and both pitches are zero:
//!     a) with imageDMA_ set, the DmaBlitManager copy is tried and its
//!        success is returned right away;
//!     b) unless the optimization is disabled, and if internal transfer
//!        buffers exist (xferBufferSize_ != 0), a two step transfer is
//!        executed: the source is copied into the internal transfer buffers
//!        and the copy kernel then moves the data from those buffers into
//!        the image.
//!  3. If none of the above produced a successful result, the copy kernel is
//!     run directly on the source buffer for the whole region.
//! synchronize() is called before every return.
//!
//! \param srcMemory  Source buffer
//! \param dstMemory  Destination image
//! \param srcOrigin  Origin inside the source buffer
//! \param dstOrigin  Origin inside the destination image
//! \param size       Size of the copy region
//! \param entire     Forwarded to the underlying copy implementations
//! \param rowPitch   Row pitch of the source data, 0 if not specified
//! \param slicePitch Slice pitch of the source data, 0 if not specified
//! \return Result of the last path that was attempted
bool
KernelBlitManager::copyBufferToImage(
device::Memory& srcMemory,
device::Memory& dstMemory,
const amd::Coord3D& srcOrigin,
const amd::Coord3D& dstOrigin,
const amd::Coord3D& size,
bool entire,
size_t rowPitch,
size_t slicePitch) const
{
// Serialize transfer operations for the lifetime of this call
amd::ScopedLock k(lockXferOps_);
bool result = false;
static const bool CopyRect = false;
// Flush DMA for ASYNC copy
static const bool FlushDMA = true;
if (setup_.disableCopyBufferToImage_) {
result = DmaBlitManager::copyBufferToImage(
srcMemory, dstMemory, srcOrigin, dstOrigin, size,
entire, rowPitch, slicePitch);
synchronize();
return result;
}
// Check if buffer is in system memory with direct access
else if (gpuMem(srcMemory).isHostMemDirectAccess() &&
(rowPitch == 0) && (slicePitch == 0)) {
// First attempt to do this all with DMA,
// but there are restrictions with older hardware
if (dev().settings().imageDMA_) {
result = DmaBlitManager::copyBufferToImage(
srcMemory, dstMemory, srcOrigin, dstOrigin, size,
entire, rowPitch, slicePitch);
if (result) {
synchronize();
return result;
}
}
if (!setup_.disableCopyBufferToImageOpt_) {
// Find the overall copy size
size_t copySize = size[0] * size[1] * size[2] * gpuMem(dstMemory).elementSize();
// Check if double copy was requested
if (xferBufferSize_ != 0) {
amd::Coord3D src(srcOrigin);
amd::Coord3D xferSrc(0, 0, 0);
amd::Coord3D dst(dstOrigin);
amd::Coord3D xferRect(size);
// Find transfer size in pixels
size_t xferSizePix = xferBufferSize_ / gpuMem(dstMemory).elementSize();
bool transfer = true;
// Find transfer rectangle
// NOTE(review): the divisions below assume the region dimensions are
// non-zero - confirm that callers never pass an empty region
if (xferRect[0] > xferSizePix) {
// The algorithm can't break a line.
// It requires multiple rectangles tracking
transfer = false;
}
else {
xferRect.c[1] = xferSizePix / xferRect[0];
}
// Check if we exceeded the original size boundary in Y
if (xferRect[1] > size[1]) {
xferRect.c[1] = size[1];
xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]);
}
else {
xferRect.c[2] = 1;
}
// Check if we exceeded the original size boundary in Z
if (xferRect[2] > size[2]) {
xferRect.c[2] = size[2];
}
// Make sure size in Y dimension is divided by the rectangle size
if (size[2] > 1) {
while ((size[1] % xferRect[1]) != 0) {
xferRect.c[1]--;
}
}
// Find one step copy size, based on the copy rectangle
amd::Coord3D oneStepSize(
xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(dstMemory).elementSize());
// Initialize transfer buffer array
Memory* xferBuf[MaxXferBuffers];
for (uint i = 0; i < MaxXferBuffers; ++i) {
xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]);
if (xferBuf[i] == NULL) {
transfer = false;
break;
}
}
// Loop until we transfer all data
while (transfer && (copySize > 0)) {
// Step 1 works on temporary copies of the counters;
// the real ones are advanced in step 2
size_t copySizeTmp = copySize;
amd::Coord3D srcTmp(src);
amd::Coord3D oneStepSizeTmp(oneStepSize);
// Step 1. Initiate DRM transfer with all staging buffers
for (uint i = 0; i < MaxXferBuffers; ++i) {
// Make sure we don't transfer more than copy size
if (copySizeTmp > 0) {
if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcTmp,
xferSrc, oneStepSizeTmp, *xferBuf[i], CopyRect, FlushDMA)) {
transfer = false;
break;
}
copySizeTmp -= oneStepSizeTmp[0];
// Change buffer offset
srcTmp.c[0] += oneStepSizeTmp[0];
// Shrink the last step to the remaining data
if (copySizeTmp < oneStepSizeTmp[0]) {
oneStepSizeTmp.c[0] = copySizeTmp;
}
}
else {
break;
}
}
// Step 2. Initiate compute transfer with all staging buffers
for (uint i = 0; i < MaxXferBuffers; ++i) {
if (copySize > 0) {
if (!copyBufferToImageKernel(
*xferBuf[i], dstMemory,
xferSrc, dst, xferRect, false)) {
transfer = false;
break;
}
gpu().flushDMA(MainEngine);
copySize -= oneStepSize[0];
// Change buffer offset
src.c[0] += oneStepSize[0];
// Change image offset, ignore X offset
// (wrap Y back to the origin and carry into Z when a slice is done)
for (uint j = 1; j < 3; ++j) {
dst.c[j] += xferRect[j];
if ((dst[j] - dstOrigin[j]) >= size[j]) {
dst.c[j] = dstOrigin[j];
}
else {
break;
}
}
// Recalculate rectangle size if the remaining data is smaller
if (copySize < oneStepSize[0]) {
for (uint j = 0; j < 3; ++j) {
xferRect.c[j] = size[j] - (dst[j] - dstOrigin[j]);
}
oneStepSize.c[0] = copySize;
}
}
else {
break;
}
}
}
// The two step transfer succeeded only if everything was copied
if (copySize == 0) {
result = true;
}
else {
LogWarning("2 step transfer in copyBufferToImage failed");
}
}
}
}
// Fall back to the plain kernel copy of the whole region
if (!result) {
result = copyBufferToImageKernel(srcMemory,
dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
}
synchronize();
return result;
}
//! Computes the row and slice pitches, in elements, for a buffer/image copy.
//!
//! \param pitch      [out] pitch[0] receives the row pitch and pitch[1]
//!                   the slice pitch
//! \param copySize   Copy region size in elements (components 0 and 1 are read)
//! \param rowPitch   Row pitch in bytes, 0 to derive it from the copy width
//! \param slicePitch Slice pitch in bytes, 0 to derive it from the row pitch
//! \param mem        Memory object that supplies the format and dimension
void
CalcRowSlicePitches(
    cl_int* pitch, const cl_int* copySize,
    size_t rowPitch, size_t slicePitch, const Memory& mem)
{
    const size_t elementBytes = memoryFormatSize(mem.cal()->format_).size_;
    const bool is1DArray = (mem.cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY);

    // Row pitch: convert the explicit value to elements or use the copy width
    if (rowPitch != 0) {
        pitch[0] = rowPitch / elementBytes;
    }
    else {
        pitch[0] = copySize[0];
    }

    // Slice pitch: convert the explicit value to elements or derive it from
    // the row pitch (a 1D array has a single row per slice)
    if (slicePitch != 0) {
        pitch[1] = slicePitch / elementBytes;
    }
    else {
        pitch[1] = pitch[0] * (is1DArray ? 1 : copySize[1]);
    }
    assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch");

    if (is1DArray) {
        // For 1D array rowPitch = slicePitch
        pitch[0] = pitch[1];
    }
}
//! Stores one kernel argument directly in the kernel's parameter block.
//!
//! \param kernel  Kernel whose parameters are updated
//! \param index   Argument index in the kernel signature
//! \param size    Argument size in bytes; for a zero-sized descriptor
//!                (local memory) this value itself is stored
//! \param value   Pointer to the argument value; for a pointer argument it
//!                points to a Memory* and may be NULL
static void
setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value)
{
    const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
    void* param = kernel->parameters().values() + desc.offset_;
    assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) &&
        "not a valid local mem arg");

    // Staging values; the one matching the descriptor size is stored below
    uint32_t value32 = 0;
    uint64_t value64 = 0;

    const bool isMemoryPointer = (desc.type_ == T_POINTER) && (desc.size_ != 0);
    if (isMemoryPointer) {
        // A missing memory object leaves the staged value at zero
        if (value != NULL) {
            // Store the Memory* itself, sized by the host pointer width
            LP64_SWITCH(value32, value64) =
                (uintptr_t)(*static_cast<Memory* const *>(value));
        }
    }
    else if (desc.type_ == T_SAMPLER) {
        assert(false && "No sampler support in blit manager! Use internal samplers!");
    }
    else {
        // Plain value: load it according to the descriptor size
        switch (desc.size_) {
        case 1:
            value32 = *static_cast<const uint8_t*>(value);
            break;
        case 2:
            value32 = *static_cast<const uint16_t*>(value);
            break;
        case 4:
            value32 = *static_cast<const uint32_t*>(value);
            break;
        case 8:
            value64 = *static_cast<const uint64_t*>(value);
            break;
        default:
            break;
        }
    }

    // Write the argument into the parameter block
    switch (desc.size_) {
    case 0 /*local mem*/ :
        *static_cast<size_t*>(param) = size;
        break;
    case sizeof(uint32_t):
        *static_cast<uint32_t*>(param) = value32;
        break;
    case sizeof(uint64_t):
        *static_cast<uint64_t*>(param) = value64;
        break;
    default:
        // Any other size is copied verbatim from the caller's memory
        ::memcpy(param, value, size);
        break;
    }
}
bool
KernelBlitManager::copyBufferToImageKernel(
device::Memory& srcMemory,
device::Memory& dstMemory,
const amd::Coord3D& srcOrigin,
const amd::Coord3D& dstOrigin,
const amd::Coord3D& size,
bool entire,
size_t rowPitch,
size_t slicePitch) const
{
bool rejected = false;
Memory* dstView = &gpuMem(dstMemory);
bool releaseView = false;
bool result = false;
CalFormat imgFormat;
imgFormat.channelOrder_ = gpuMem(dstMemory).cal()->channelOrder_;
imgFormat.type_ = gpuMem(dstMemory).cal()->format_;
amd::Image::Format newFormat(dev().getOclFormat(imgFormat));
// Find unsupported formats
for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
newFormat.image_channel_data_type = RejectedData[i].clNewType_;
rejected = true;
break;
}
}
// Find unsupported channel's order
for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
newFormat.image_channel_order = RejectedOrder[i].clNewType_;
rejected = true;
break;
}
}
// If the image format was rejected, then attempt to create a view
if (rejected) {
dstView = createView(gpuMem(dstMemory), dev().getCalFormat(newFormat));
if (dstView != NULL) {
rejected = false;
releaseView = true;
}
}
// Fall into the host path if the image format was rejected
if (rejected) {
return HostBlitManager::copyBufferToImage(
srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
}
// Use a common blit type with three dimensions by default
uint blitType = BlitCopyBufferToImage;
size_t dim = 0;
size_t globalWorkOffset[3] = { 0, 0, 0 };
size_t globalWorkSize[3];
size_t localWorkSize[3];
bool swapLayer = (gpuMem(dstMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) &&
!dev().settings().siPlus_;
// Program the kernels workload depending on the blit dimensions
dim = 3;
if (gpuMem(dstMemory).cal()->dimSize_ == 1) {
globalWorkSize[0] = amd::alignUp(size[0], 256);
globalWorkSize[1] = amd::alignUp(size[1], 1);
globalWorkSize[2] = amd::alignUp(size[2], 1);
localWorkSize[0] = 256;
localWorkSize[1] = localWorkSize[2] = 1;
}
else if (gpuMem(dstMemory).cal()->dimSize_ == 2) {
globalWorkSize[0] = amd::alignUp(size[0], 16);
globalWorkSize[1] = amd::alignUp(size[1], 16);
globalWorkSize[2] = amd::alignUp(size[2], 1);
localWorkSize[0] = localWorkSize[1] = 16;
localWorkSize[2] = 1;
// Swap the Y and Z components, apparently HW expects
// layer in Z
if (swapLayer) {
globalWorkSize[2] = globalWorkSize[1];
globalWorkSize[1] = 1;
localWorkSize[2] = localWorkSize[1];
localWorkSize[1] = 1;
}
}
else {
globalWorkSize[0] = amd::alignUp(size[0], 8);
globalWorkSize[1] = amd::alignUp(size[1], 8);
globalWorkSize[2] = amd::alignUp(size[2], 4);
localWorkSize[0] = localWorkSize[1] = 8;
localWorkSize[2] = 4;
}
// Program kernels arguments for the blit operation
Memory* mem = &gpuMem(srcMemory);
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
mem = dstView;
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
const MemFormatStruct& memFmt = memoryFormatSize(gpuMem(dstMemory).cal()->format_);
// 1 element granularity for writes by default
cl_int granularity = 1;
if (memFmt.size_ == 2) {
granularity = 2;
}
else if (memFmt.size_ >= 4) {
granularity = 4;
}
CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
cl_int srcOrg[4] = { (cl_int)srcOrigin[0] / granularity,
(cl_int)srcOrigin[1],
(cl_int)srcOrigin[2], 0 };
setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
cl_int dstOrg[4] = { (cl_int)dstOrigin[0],
(cl_int)dstOrigin[1],
(cl_int)dstOrigin[2], 0 };
cl_int copySize[4] = { (cl_int)size[0],
(cl_int)size[1],
(cl_int)size[2], 0 };
if (swapLayer) {
dstOrg[2] = dstOrg[1];
dstOrg[1] = 0;
copySize[2] = copySize[1];
copySize[1] = 1;
}
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
// Program memory format
uint multiplier = memFmt.size_ / sizeof(uint32_t);
multiplier = (multiplier == 0) ? 1 : multiplier;
cl_int format[4] = { (cl_int)memFmt.components_,
(cl_int)memFmt.size_ / (cl_int)memFmt.components_,
(cl_int)multiplier, 0 };
setArgument(kernels_[blitType], 5, sizeof(format), format);
// Program row and slice pitches
cl_int pitch[4] = { 0 };
CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory));
setArgument(kernels_[blitType], 6, sizeof(pitch), pitch);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim,
globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit
address parameters = kernels_[blitType]->parameters().values();
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
if (releaseView) {
delete dstView;
}
return result;
}
// Copies a region of an image into a buffer.
//
// The transfer path is selected in the following order:
//  1. Host copy (HostBlitManager) if setup_.disableCopyImageToBuffer_ is set.
//  2. If the destination has direct host access and no pitches were requested:
//     a) DMA copy (DmaBlitManager) when the imageDMA_ setting is enabled;
//     b) a two step transfer when xferBufferSize_ is non-zero: the blit
//        kernel copies image rectangles into the staging buffers and
//        partialMemCopyTo() moves every staging buffer into the destination.
//  3. The blit kernel (copyImageToBufferKernel) if nothing above succeeded.
//
// srcMemory/dstMemory  - source image and destination buffer
// srcOrigin/dstOrigin  - start of the copy region in the image/buffer
// size                 - copy region in pixels
// entire               - forwarded unchanged to the selected copy path
// rowPitch/slicePitch  - destination pitches, 0 means tightly packed
//
// Returns true if the selected path reported success.
bool
KernelBlitManager::copyImageToBuffer(
    device::Memory&     srcMemory,
    device::Memory&     dstMemory,
    const amd::Coord3D& srcOrigin,
    const amd::Coord3D& dstOrigin,
    const amd::Coord3D& size,
    bool        entire,
    size_t      rowPitch,
    size_t      slicePitch) const
{
    amd::ScopedLock k(lockXferOps_);
    bool    result = false;
    static const bool CopyRect = false;
    // Flush DMA for ASYNC copy
    static const bool FlushDMA = true;

    if (setup_.disableCopyImageToBuffer_) {
        result = HostBlitManager::copyImageToBuffer(
            srcMemory, dstMemory, srcOrigin, dstOrigin,
            size, entire, rowPitch, slicePitch);
        synchronize();
        return result;
    }
    // Check if buffer is in system memory with direct access
    else if (gpuMem(dstMemory).isHostMemDirectAccess() &&
             (rowPitch == 0) && (slicePitch == 0)) {
        // First attempt to do this all with DMA,
        // but there are restrictions with older hardware
        if (dev().settings().imageDMA_) {
            result = DmaBlitManager::copyImageToBuffer(
                srcMemory, dstMemory, srcOrigin, dstOrigin,
                size, entire, rowPitch, slicePitch);
            if (result) {
                synchronize();
                return result;
            }
        }

        // Find the overall copy size
        size_t copySize = size[0] * size[1] * size[2] * gpuMem(srcMemory).elementSize();

        // Check if double copy was requested
        if (xferBufferSize_ != 0) {
            amd::Coord3D src(srcOrigin);
            amd::Coord3D dst(dstOrigin);
            amd::Coord3D xferDst(0, 0, 0);
            amd::Coord3D xferRect(size);
            // Find transfer size in pixels
            size_t xferSizePix = xferBufferSize_ / gpuMem(srcMemory).elementSize();
            bool transfer = true;

            // Find transfer rectangle
            if (xferRect[0] > xferSizePix) {
                // The algorithm can't break a line.
                // It requires multiple rectangles tracking
                transfer = false;
            }
            else {
                xferRect.c[1] = xferSizePix / xferRect[0];
            }

            // Check if we exceeded the original size boundary in Y
            if (xferRect[1] > size[1]) {
                xferRect.c[1] = size[1];
                xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]);
            }
            else {
                xferRect.c[2] = 1;
            }

            // Check if we exceeded the original size boundary in Z
            if (xferRect[2] > size[2]) {
                xferRect.c[2] = size[2];
            }

            // Make sure size in Y dimension is divided by the rectangle size
            if (size[2] > 1) {
                while ((size[1] % xferRect[1]) != 0) {
                    xferRect.c[1]--;
                }
            }

            // Find one step copy size, based on the copy rectangle
            amd::Coord3D oneStepSize(
                xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(srcMemory).elementSize());

            // Initialize transfer buffer array
            Memory* xferBuf[MaxXferBuffers];
            for (uint i = 0; i < MaxXferBuffers; ++i) {
                xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]);
                if (xferBuf[i] == NULL) {
                    transfer = false;
                    break;
                }
            }

            // Loop until we transfer all data
            while (transfer && (copySize > 0)) {
                // Step 1 works on temporary copies of the progress state,
                // step 2 replays the same sequence on the real state
                size_t copySizeTmp = copySize;
                amd::Coord3D srcTmp(src);
                amd::Coord3D oneStepSizeTmp(oneStepSize);
                amd::Coord3D xferRectTmp(xferRect);

                // Step 1. Initiate compute transfer with all staging buffers
                for (uint i = 0; i < MaxXferBuffers; ++i) {
                    if (copySizeTmp > 0) {
                        if (!copyImageToBufferKernel(
                                srcMemory, *xferBuf[i],
                                srcTmp, xferDst, xferRectTmp, false)) {
                            transfer = false;
                            break;
                        }
                        gpu().flushDMA(MainEngine);
                        copySizeTmp -= oneStepSizeTmp[0];

                        // Change image offset, ignore X offset
                        for (uint j = 1; j < 3; ++j) {
                            srcTmp.c[j] += xferRectTmp[j];
                            if ((srcTmp[j] - srcOrigin[j]) >= size[j]) {
                                srcTmp.c[j] = srcOrigin[j];
                            }
                            else {
                                break;
                            }
                        }

                        // Recalculate rectangle size if the remain data is smaller
                        if (copySizeTmp < oneStepSizeTmp[0]) {
                            for (uint j = 0; j < 3; ++j) {
                                xferRectTmp.c[j] = size[j] - (srcTmp[j] - srcOrigin[j]);
                            }
                        }
                    }
                    else {
                        break;
                    }
                }

                // Step 2. Initiate DRM transfer with all staging buffers
                for (uint i = 0; i < MaxXferBuffers; ++i) {
                    // Make sure we don't transfer more than copy size
                    if (copySize > 0) {
                        if (!xferBuf[i]->partialMemCopyTo(gpu(), xferDst, dst,
                                oneStepSize, gpuMem(dstMemory), CopyRect, FlushDMA)) {
                            transfer = false;
                            break;
                        }
                        copySize -= oneStepSize[0];

                        // Change buffer offset
                        dst.c[0] += oneStepSize[0];

                        // Change image offset, ignore X offset
                        for (uint j = 1; j < 3; ++j) {
                            src.c[j] += xferRect[j];
                            if ((src[j] - srcOrigin[j]) >= size[j]) {
                                src.c[j] = srcOrigin[j];
                            }
                            else {
                                break;
                            }
                        }

                        // Recalculate rectangle size if the remain data is smaller
                        if (copySize < oneStepSize[0]) {
                            for (uint j = 0; j < 3; ++j) {
                                xferRect.c[j] = size[j] - (src[j] - srcOrigin[j]);
                            }
                            oneStepSize.c[0] = copySize;
                        }
                    }
                    else {
                        break;
                    }
                }
            }

            // The staged transfer succeeded only if all data was moved
            if (copySize == 0) {
                result = true;
            }
            else {
                LogWarning("2 step transfer in copyImageToBuffer failed");
            }
        }
    }

    // Use the blit kernel if no other path has finished the copy
    if (!result) {
        result = copyImageToBufferKernel(srcMemory,
            dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
    }

    synchronize();

    return result;
}
// Copies a region of an image into a buffer with the BlitCopyImageToBuffer
// kernel.
//
// If the image data type or channel order is listed in RejectedData /
// RejectedOrder, the copy runs through a temporary view with the
// replacement format. If the view can't be created, the copy falls back to
// the host path, which receives the same row and slice pitches as the
// kernel path would.
//
// srcMemory/dstMemory  - source image and destination buffer
// srcOrigin/dstOrigin  - start of the copy region in the image/buffer
// size                 - copy region in pixels
// entire               - forwarded to the host fallback only
// rowPitch/slicePitch  - destination pitches
//
// Returns the result of the kernel submission or of the host fallback.
bool
KernelBlitManager::copyImageToBufferKernel(
    device::Memory&     srcMemory,
    device::Memory&     dstMemory,
    const amd::Coord3D& srcOrigin,
    const amd::Coord3D& dstOrigin,
    const amd::Coord3D& size,
    bool        entire,
    size_t      rowPitch,
    size_t      slicePitch) const
{
    bool    rejected = false;
    Memory* srcView = &gpuMem(srcMemory);
    bool    releaseView = false;
    bool    result = false;
    CalFormat imgFormat;
    imgFormat.channelOrder_ = gpuMem(srcMemory).cal()->channelOrder_;
    imgFormat.type_ = gpuMem(srcMemory).cal()->format_;
    amd::Image::Format newFormat(dev().getOclFormat(imgFormat));

    // Find unsupported formats
    for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
        if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
            newFormat.image_channel_data_type = RejectedData[i].clNewType_;
            rejected = true;
            break;
        }
    }

    // Find unsupported channel's order
    for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
        if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
            newFormat.image_channel_order = RejectedOrder[i].clNewType_;
            rejected = true;
            break;
        }
    }

    // If the image format was rejected, then attempt to create a view
    if (rejected) {
        srcView = createView(gpuMem(srcMemory), dev().getCalFormat(newFormat));
        if (srcView != NULL) {
            rejected = false;
            releaseView = true;
        }
    }

    // Fall into the host path if the image format was rejected.
    // The pitches must be forwarded, otherwise a pitched copy would be
    // written with the wrong layout.
    if (rejected) {
        return HostBlitManager::copyImageToBuffer(
            srcMemory, dstMemory, srcOrigin, dstOrigin,
            size, entire, rowPitch, slicePitch);
    }

    uint    blitType = BlitCopyImageToBuffer;
    // Always use a three dimensional dispatch
    size_t  dim = 3;
    size_t  globalWorkOffset[3] = { 0, 0, 0 };
    size_t  globalWorkSize[3];
    size_t  localWorkSize[3];
    bool    swapLayer = (gpuMem(srcMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) &&
        !dev().settings().siPlus_;

    // Program the kernels workload depending on the image dimensions
    if (gpuMem(srcMemory).cal()->dimSize_ == 1) {
        globalWorkSize[0] = amd::alignUp(size[0], 256);
        globalWorkSize[1] = amd::alignUp(size[1], 1);
        globalWorkSize[2] = amd::alignUp(size[2], 1);
        localWorkSize[0] = 256;
        localWorkSize[1] = localWorkSize[2] = 1;
    }
    else if (gpuMem(srcMemory).cal()->dimSize_ == 2) {
        globalWorkSize[0] = amd::alignUp(size[0], 16);
        globalWorkSize[1] = amd::alignUp(size[1], 16);
        globalWorkSize[2] = amd::alignUp(size[2], 1);
        localWorkSize[0] = localWorkSize[1] = 16;
        localWorkSize[2] = 1;
        // Swap the Y and Z components, apparently HW expects
        // layer in Z
        if (swapLayer) {
            globalWorkSize[2] = globalWorkSize[1];
            globalWorkSize[1] = 1;
            localWorkSize[2] = localWorkSize[1];
            localWorkSize[1] = 1;
        }
    }
    else {
        globalWorkSize[0] = amd::alignUp(size[0], 8);
        globalWorkSize[1] = amd::alignUp(size[1], 8);
        globalWorkSize[2] = amd::alignUp(size[2], 4);
        localWorkSize[0] = localWorkSize[1] = 8;
        localWorkSize[2] = 4;
    }

    // Program kernels arguments for the blit operation
    Memory* mem = srcView;
    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
    mem = &gpuMem(dstMemory);
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);

    // Update extra parameters for USHORT and UBYTE pointers.
    // Only then compiler can optimize the kernel to use
    // UAV Raw for other writes
    setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem);
    setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem);

    cl_int  srcOrg[4] = { (cl_int)srcOrigin[0],
                          (cl_int)srcOrigin[1],
                          (cl_int)srcOrigin[2], 0 };
    cl_int  copySize[4] = { (cl_int)size[0],
                            (cl_int)size[1],
                            (cl_int)size[2], 0 };
    // Move the layer from Y into Z to match the swapped workload
    if (swapLayer) {
        srcOrg[2] = srcOrg[1];
        srcOrg[1] = 0;
        copySize[2] = copySize[1];
        copySize[1] = 1;
    }
    setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);

    const MemFormatStruct& memFmt = memoryFormatSize(gpuMem(srcMemory).cal()->format_);

    // 1 element granularity for writes by default
    cl_int  granularity = 1;
    if (memFmt.size_ == 2) {
        granularity = 2;
    }
    else if (memFmt.size_ >= 4) {
        granularity = 4;
    }
    CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
    // The buffer X offset is passed in units of the write granularity
    cl_int  dstOrg[4] = { (cl_int)dstOrigin[0] / granularity,
                          (cl_int)dstOrigin[1],
                          (cl_int)dstOrigin[2], 0 };
    setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg);
    setArgument(kernels_[blitType], 6, sizeof(copySize), copySize);

    // Program memory format
    uint    multiplier = memFmt.size_ / sizeof(uint32_t);
    multiplier = (multiplier == 0) ? 1 : multiplier;
    cl_int  format[4] = { (cl_int)memFmt.components_,
                          (cl_int)memFmt.size_ / (cl_int)memFmt.components_,
                          (cl_int)multiplier, 0 };
    setArgument(kernels_[blitType], 7, sizeof(format), format);

    // Program row and slice pitches
    cl_int  pitch[4] = { 0 };
    CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory));
    setArgument(kernels_[blitType], 8, sizeof(pitch), pitch);

    // Create ND range object for the kernel's execution
    amd::NDRangeContainer ndrange(dim,
        globalWorkOffset, globalWorkSize, localWorkSize);

    // Execute the blit
    address parameters = kernels_[blitType]->parameters().values();
    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);

    // The temporary view is owned by this function
    if (releaseView) {
        delete srcView;
    }

    return result;
}
// Copies a region from one image into another image with a blit kernel.
//
// If the source data type is listed in RejectedData, both images are
// accessed through temporary views with the replacement format; the host
// path is used if those views can't be created.
//
// Returns the result of the kernel submission or of the host copy.
bool
KernelBlitManager::copyImage(
    device::Memory&     srcMemory,
    device::Memory&     dstMemory,
    const amd::Coord3D& srcOrigin,
    const amd::Coord3D& dstOrigin,
    const amd::Coord3D& size,
    bool        entire) const
{
    amd::ScopedLock lock(lockXferOps_);

    Memory& srcImage = gpuMem(srcMemory);
    Memory& dstImage = gpuMem(dstMemory);
    Memory* srcView = &srcImage;
    Memory* dstView = &dstImage;
    bool    ownViews = false;

    // Translate the source CAL format into the OCL format
    CalFormat calFormat;
    calFormat.channelOrder_ = srcImage.cal()->channelOrder_;
    calFormat.type_ = srcImage.cal()->format_;
    amd::Image::Format viewFormat(dev().getOclFormat(calFormat));

    // Look for a data type the kernel can't handle
    bool    unsupported = false;
    for (uint idx = 0; idx < RejectedFormatDataTotal; ++idx) {
        if (RejectedData[idx].clOldType_ == viewFormat.image_channel_data_type) {
            viewFormat.image_channel_data_type = RejectedData[idx].clNewType_;
            unsupported = true;
            break;
        }
    }

    if (unsupported) {
        // The channel order matters only if the data type was replaced.
        // Note: Image blit is independent from the channel order
        for (uint idx = 0; idx < RejectedFormatChannelTotal; ++idx) {
            if (RejectedOrder[idx].clOldType_ == viewFormat.image_channel_order) {
                viewFormat.image_channel_order = RejectedOrder[idx].clNewType_;
                break;
            }
        }

        // Both views are required to run the kernel
        srcView = createView(srcImage, dev().getCalFormat(viewFormat));
        if (srcView != NULL) {
            dstView = createView(dstImage, dev().getCalFormat(viewFormat));
            if (dstView == NULL) {
                delete srcView;
            }
            else {
                ownViews = true;
                unsupported = false;
            }
        }
    }

    // Host path if the views for the rejected format are unavailable
    if (unsupported) {
        const bool hostResult = HostBlitManager::copyImage(srcMemory, dstMemory,
            srcOrigin, dstOrigin, size, entire);
        synchronize();
        return hostResult;
    }

    // Pick the workgroup shape from the image dimensions
    size_t  groupX;
    size_t  groupY;
    size_t  groupZ;
    if ((srcImage.cal()->dimSize_ == 1) || (dstImage.cal()->dimSize_ == 1)) {
        groupX = 256;
        groupY = 1;
        groupZ = 1;
    }
    else if ((srcImage.cal()->dimSize_ == 2) || (dstImage.cal()->dimSize_ == 2)) {
        groupX = 16;
        groupY = 16;
        groupZ = 1;
    }
    else {
        groupX = 8;
        groupY = 8;
        groupZ = 4;
    }

    const size_t  dim = 3;
    size_t  globalWorkOffset[3] = { 0, 0, 0 };
    size_t  localWorkSize[3] = { groupX, groupY, groupZ };
    size_t  globalWorkSize[3] = { amd::alignUp(size[0], groupX),
                                  amd::alignUp(size[1], groupY),
                                  amd::alignUp(size[2], groupZ) };

    const bool srcIs1DArray =
        (srcImage.cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY);
    const bool dstIs1DArray =
        (dstImage.cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY);

    // The current OpenCL spec allows "copy images from a 1D image
    // array object to a 1D image array object" only.
    const uint blitType = (srcIs1DArray || dstIs1DArray) ?
        BlitCopyImage1DA : BlitCopyImage;

    // Arguments 0 and 1: source and destination images (or their views)
    Memory* arg = srcView;
    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &arg);
    arg = dstView;
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &arg);

    // Argument 2: source origin, the last component flags a 1D array
    // on devices without siPlus_
    cl_int  srcOrg[4] = { (cl_int)srcOrigin[0],
                          (cl_int)srcOrigin[1],
                          (cl_int)srcOrigin[2], 0 };
    if (srcIs1DArray && !dev().settings().siPlus_) {
        srcOrg[3] = 1;
    }
    setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);

    // Argument 3: destination origin, same convention as the source
    cl_int  dstOrg[4] = { (cl_int)dstOrigin[0],
                          (cl_int)dstOrigin[1],
                          (cl_int)dstOrigin[2], 0 };
    if (dstIs1DArray && !dev().settings().siPlus_) {
        dstOrg[3] = 1;
    }
    setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);

    // Argument 4: copy region
    cl_int  copySize[4] = { (cl_int)size[0],
                            (cl_int)size[1],
                            (cl_int)size[2], 0 };
    setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);

    // Launch the kernel
    amd::NDRangeContainer ndrange(dim,
        globalWorkOffset, globalWorkSize, localWorkSize);
    address parameters = kernels_[blitType]->parameters().values();
    const bool result =
        gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);

    // Temporary views belong to this function
    if (ownViews) {
        delete srcView;
        delete dstView;
    }

    synchronize();

    return result;
}
// Calculates the size of the host memory range, covered by an image
// transfer, and normalizes the pitches.
//
// pinSize     - [out] size of the range in bytes
// size        - transfer region in pixels
// rowPitch    - [in/out] reset to 0 if it matches the packed row size
// slicePitch  - [in/out] reset to 0 if it matches the packed slice size
// mem         - image, which provides element size and dimensions
void
FindPinSize(
    size_t& pinSize, const amd::Coord3D& size,
    size_t& rowPitch, size_t& slicePitch, const Memory& mem)
{
    // Start from a tightly packed row
    pinSize = size[0] * mem.elementSize();
    if ((rowPitch != 0) && (rowPitch != pinSize)) {
        // The caller uses a custom row pitch
        pinSize = rowPitch;
    }
    else {
        rowPitch = 0;
    }

    // Extend the size over the remaining image dimensions
    for (uint d = 1; d < mem.cal()->dimSize_; ++d) {
        pinSize *= size[d];

        // The slice pitch applies to the second dimension only
        if (d != 1) {
            continue;
        }

        if ((slicePitch == 0) || (slicePitch == pinSize)) {
            slicePitch = 0;
            continue;
        }

        // Custom slice pitch: 1D arrays span one slice pitch per layer
        pinSize = (mem.cal()->dimension_ != GSL_MOA_TEXTURE_1D_ARRAY) ?
            slicePitch : (slicePitch * size[d]);
    }
}
// Reads a region of an image into host memory.
//
// The host copy is used if the read is disabled in the setup, if the image
// has cacheable direct host access or if the host memory can't be pinned.
// Otherwise the host memory is pinned and filled with copyImageToBuffer().
bool
KernelBlitManager::readImage(
    device::Memory&     srcMemory,
    void*       dstHost,
    const amd::Coord3D& origin,
    const amd::Coord3D& size,
    size_t      rowPitch,
    size_t      slicePitch,
    bool        entire) const
{
    amd::ScopedLock lock(lockXferOps_);

    Memory& image = gpuMem(srcMemory);
    amd::Memory* pinned = NULL;
    size_t  partial = 0;

    bool    hostCopy = setup_.disableReadImage_ ||
        (image.isHostMemDirectAccess() && image.isCacheable());

    if (!hostCopy) {
        // Pin the destination range; FindPinSize() also normalizes pitches
        size_t  pinSize;
        FindPinSize(pinSize, size, rowPitch, slicePitch, image);
        pinned = pinHostMemory(dstHost, pinSize, partial);
        // Force SW copy if pinning failed
        hostCopy = (pinned == NULL);
    }

    if (hostCopy) {
        const bool hostResult = HostBlitManager::readImage(srcMemory, dstHost,
            origin, size, rowPitch, slicePitch, entire);
        synchronize();
        return hostResult;
    }

    // Offset of the host pointer inside the pinned memory object
    const amd::Coord3D dstOrigin(partial);

    // Device memory of the pinned object for this virtual device
    Memory* dstMemory = dev().getGpuMemory(pinned);

    // Transfer the image into the pinned buffer
    const bool result = copyImageToBuffer(srcMemory, *dstMemory,
        origin, dstOrigin, size, entire, rowPitch, slicePitch);

    // Keep the pinned memory for a later release
    gpu().addPinnedMem(pinned);

    synchronize();

    return result;
}
bool
KernelBlitManager::writeImage(
const void* srcHost,
device::Memory& dstMemory,
const amd::Coord3D& origin,
const amd::Coord3D& size,
size_t rowPitch,
size_t slicePitch,
bool entire) const
{
amd::ScopedLock k(lockXferOps_);
bool result = false;
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteImage_||
gpuMem(dstMemory).isHostMemDirectAccess() ||
gpuMem(dstMemory).isPersistentDirectMap()) {
result = HostBlitManager::writeImage(
srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
synchronize();
return result;
}
else {
size_t pinSize;
FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory));
size_t partial;
amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
if (amdMemory == NULL) {
// Force SW copy
result = HostBlitManager::writeImage(
srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
synchronize();
return result;
}
// Readjust destination offset
const amd::Coord3D srcOrigin(partial);
// Get device memory for this virtual device
Memory* srcMemory = dev().getGpuMemory(amdMemory);
// Copy image to buffer
result = copyBufferToImage(*srcMemory, dstMemory,
srcOrigin, origin, size, entire, rowPitch, slicePitch);
// Add pinned memory for a later release
gpu().addPinnedMem(amdMemory);
}
synchronize();
return result;
}
// Copies a rectangular region between two buffers.
//
// A DMA copy is attempted first if either buffer has direct host access
// and the rectLinearDMA_ setting is enabled. A host copy is attempted if
// the heap isn't virtual and either buffer has no heap block. If none of
// them succeeded, then the BlitCopyBufferRect(Aligned) kernel is used.
//
// NOTE(review): setup_.disableCopyBufferRect_ alone only enters the
// fallback section; neither fallback runs unless its own condition holds,
// so the kernel path is still taken in that case - confirm this is intended.
//
// srcRectIn/dstRectIn  - pitches and offsets of both rectangles in bytes
// sizeIn               - copy region, the first dimension is in bytes
// entire               - forwarded to the DMA/host paths only
//
// Returns the result of the path that performed the copy.
bool
KernelBlitManager::copyBufferRect(
    device::Memory&     srcMemory,
    device::Memory&     dstMemory,
    const amd::BufferRect&   srcRectIn,
    const amd::BufferRect&   dstRectIn,
    const amd::Coord3D& sizeIn,
    bool        entire) const
{
    amd::ScopedLock k(lockXferOps_);
    bool    result = false;

    // Fall into the CAL path for rejected transfers
    if (setup_.disableCopyBufferRect_ ||
        ((gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) &&
         dev().settings().rectLinearDMA_) ||
        (!dev().heap()->isVirtual() &&
         ((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))) {
        // Copy data with CAL (no VM mode only)
        if ((gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess())
            && dev().settings().rectLinearDMA_) {
            result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
                srcRectIn, dstRectIn, sizeIn, entire);
        }

        // Host copy only if DMA didn't finish the transfer
        if ((!dev().heap()->isVirtual() && ((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))
            && !result) {
            result = HostBlitManager::copyBufferRect(srcMemory, dstMemory,
                srcRectIn, dstRectIn, sizeIn, entire);
        }

        if (result) {
            synchronize();
            return result;
        }
    }

    uint    blitType = BlitCopyBufferRect;
    size_t  dim = 3;
    size_t  globalWorkOffset[3] = { 0, 0, 0 };
    size_t  globalWorkSize[3];
    size_t  localWorkSize[3];

    // Candidate element sizes in bytes, from the widest to a single byte.
    // The last entry is 1, so the search below always stops with a valid i.
    const static uint CopyRectAlignment[3] = { 16, 4, 1 };

    bool    aligned;
    uint    i;
    for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) {
        // Check source alignments
        aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0);
        aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0);
        aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0);

        // Check destination alignments
        aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0);
        aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0);
        aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0);

        // Check copy size alignment in the first dimension
        aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0);

        if (aligned) {
            if (CopyRectAlignment[i] != 1) {
                blitType = BlitCopyBufferRectAligned;
            }
            break;
        }
    }

    // Convert all byte values into units of the selected element size
    amd::BufferRect srcRect;
    amd::BufferRect dstRect;
    amd::Coord3D    size(sizeIn[0], sizeIn[1], sizeIn[2]);

    srcRect.rowPitch_   = srcRectIn.rowPitch_ / CopyRectAlignment[i];
    srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i];
    srcRect.start_      = srcRectIn.start_ / CopyRectAlignment[i];
    srcRect.end_        = srcRectIn.end_ / CopyRectAlignment[i];

    dstRect.rowPitch_   = dstRectIn.rowPitch_ / CopyRectAlignment[i];
    dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i];
    dstRect.start_      = dstRectIn.start_ / CopyRectAlignment[i];
    dstRect.end_        = dstRectIn.end_ / CopyRectAlignment[i];

    size.c[0] /= CopyRectAlignment[i];

    // Program the kernel's workload depending on the transfer dimensions
    if ((size[1] == 1) && (size[2] == 1)) {
        globalWorkSize[0] = amd::alignUp(size[0], 256);
        globalWorkSize[1] = 1;
        globalWorkSize[2] = 1;
        localWorkSize[0] = 256;
        localWorkSize[1] = 1;
        localWorkSize[2] = 1;
    }
    else if (size[2] == 1) {
        globalWorkSize[0] = amd::alignUp(size[0], 16);
        globalWorkSize[1] = amd::alignUp(size[1], 16);
        globalWorkSize[2] = 1;
        localWorkSize[0] = localWorkSize[1] = 16;
        localWorkSize[2] = 1;
    }
    else {
        globalWorkSize[0] = amd::alignUp(size[0], 8);
        globalWorkSize[1] = amd::alignUp(size[1], 8);
        globalWorkSize[2] = amd::alignUp(size[2], 4);
        localWorkSize[0] = localWorkSize[1] = 8;
        localWorkSize[2] = 4;
    }

    // Program kernels arguments for the blit operation
    Memory* mem = &gpuMem(srcMemory);
    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
    mem = &gpuMem(dstMemory);
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
    cl_uint src[4] = { (cl_uint)srcRect.rowPitch_,
                       (cl_uint)srcRect.slicePitch_,
                       (cl_uint)srcRect.start_, 0 };
    setArgument(kernels_[blitType], 2, sizeof(src), src);
    cl_uint dst[4] = { (cl_uint)dstRect.rowPitch_,
                       (cl_uint)dstRect.slicePitch_,
                       (cl_uint)dstRect.start_, 0 };
    setArgument(kernels_[blitType], 3, sizeof(dst), dst);
    // The last component carries the selected element size
    cl_int  copySize[4] = { (cl_int)size[0],
                            (cl_int)size[1],
                            (cl_int)size[2],
                            (cl_int)CopyRectAlignment[i] };
    setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);

    // Create ND range object for the kernel's execution
    amd::NDRangeContainer ndrange(dim,
        globalWorkOffset, globalWorkSize, localWorkSize);

    // Execute the blit
    address parameters = kernels_[blitType]->parameters().values();
    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);

    synchronize();

    return result;
}
// Reads a region of a buffer into host memory.
//
// The host copy is used if the read is disabled in the setup or the buffer
// has cacheable direct host access. Transfers, which fit into a single pin
// (above MinSizeForPinnedTransfer and up to pinnedXferSize_), go through
// pinned host memory and copyBuffer(). Everything else is delegated to
// DmaBlitManager::readBuffer().
bool
KernelBlitManager::readBuffer(
    device::Memory&     srcMemory,
    void*       dstHost,
    const amd::Coord3D& origin,
    const amd::Coord3D& size,
    bool        entire) const
{
    amd::ScopedLock lock(lockXferOps_);

    Memory& source = gpuMem(srcMemory);

    // Use host copy if memory has direct access
    if (setup_.disableReadBuffer_ ||
        (source.isHostMemDirectAccess() && source.isCacheable())) {
        const bool hostResult = HostBlitManager::readBuffer(
            srcMemory, dstHost, origin, size, entire);
        synchronize();
        return hostResult;
    }

    size_t  pinSize = size[0];
    const bool singlePin = (pinSize <= dev().settings().pinnedXferSize_) &&
        (pinSize > MinSizeForPinnedTransfer);

    if (!singlePin) {
        // A big transfer drops all currently pinned memory first
        if (pinSize > dev().settings().pinnedXferSize_) {
            gpu().releasePinnedMem();
        }
        const bool dmaResult = DmaBlitManager::readBuffer(
            srcMemory, dstHost, origin, size, entire);
        synchronize();
        return dmaResult;
    }

    size_t  partial;
    amd::Memory* pinned = pinHostMemory(dstHost, pinSize, partial);
    if (pinned == NULL) {
        // Force SW copy
        const bool hostResult = HostBlitManager::readBuffer(
            srcMemory, dstHost, origin, size, entire);
        synchronize();
        return hostResult;
    }

    // Offset of the host pointer inside the pinned memory object
    amd::Coord3D dstOrigin(partial);

    // Device memory of the pinned object for this virtual device
    Memory* dstMemory = dev().getGpuMemory(pinned);

    // Transfer the buffer into the pinned memory
    const bool result = copyBuffer(srcMemory, *dstMemory,
        origin, dstOrigin, size, entire);

    // Keep the pinned memory for a later release
    gpu().addPinnedMem(pinned);

    synchronize();

    return result;
}
// Reads a rectangular region of a buffer into host memory.
//
// The host copy is used if the read is disabled in the setup, if the buffer
// has cacheable direct host access or if the host memory can't be pinned.
// Otherwise the host range is pinned and filled with copyBufferRect().
bool
KernelBlitManager::readBufferRect(
    device::Memory&     srcMemory,
    void*       dstHost,
    const amd::BufferRect&   bufRect,
    const amd::BufferRect&   hostRect,
    const amd::Coord3D& size,
    bool        entire) const
{
    amd::ScopedLock lock(lockXferOps_);

    Memory& source = gpuMem(srcMemory);
    amd::Memory* pinned = NULL;
    size_t  partial = 0;

    bool    hostCopy = setup_.disableReadBufferRect_ ||
        (source.isHostMemDirectAccess() && source.isCacheable());

    if (!hostCopy) {
        // Pin everything from the host pointer up to the rectangle's end
        size_t  pinSize = hostRect.start_ + hostRect.end_;
        pinned = pinHostMemory(dstHost, pinSize, partial);
        // Force SW copy if pinning failed
        hostCopy = (pinned == NULL);
    }

    if (hostCopy) {
        const bool hostResult = HostBlitManager::readBufferRect(
            srcMemory, dstHost, bufRect, hostRect, size, entire);
        synchronize();
        return hostResult;
    }

    // Host rectangle, shifted by the offset inside the pinned memory object
    amd::BufferRect pinnedRect;
    pinnedRect.rowPitch_   = hostRect.rowPitch_;
    pinnedRect.slicePitch_ = hostRect.slicePitch_;
    pinnedRect.start_      = hostRect.start_ + partial;
    pinnedRect.end_        = hostRect.end_;

    // Device memory of the pinned object for this virtual device
    Memory* dstMemory = dev().getGpuMemory(pinned);

    // Transfer the buffer rectangle into the pinned memory
    const bool result = copyBufferRect(srcMemory, *dstMemory,
        bufRect, pinnedRect, size, entire);

    // Keep the pinned memory for a later release
    gpu().addPinnedMem(pinned);

    synchronize();

    return result;
}
bool
KernelBlitManager::writeBuffer(
const void* srcHost,
device::Memory& dstMemory,
const amd::Coord3D& origin,
const amd::Coord3D& size,
bool entire) const
{
amd::ScopedLock k(lockXferOps_);
bool result = false;
// Use host copy if memory has direct access or it's persistent
if (setup_.disableWriteBuffer_ ||
gpuMem(dstMemory).isHostMemDirectAccess() ||
(gpuMem(dstMemory).memoryType() == Resource::Persistent)) {
result = HostBlitManager::writeBuffer(
srcHost, dstMemory, origin, size, entire);
synchronize();
return result;
}
else {
size_t pinSize = size[0];
// Check if a pinned transfer can be executed with a single pin
if ((pinSize <= dev().settings().pinnedXferSize_) &&
(pinSize > MinSizeForPinnedTransfer)) {
size_t partial;
amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
if (amdMemory == NULL) {
// Force SW copy
result = HostBlitManager::writeBuffer(
srcHost, dstMemory, origin, size, entire);
synchronize();
return result;
}
// Readjust destination offset
const amd::Coord3D srcOrigin(partial);
// Get device memory for this virtual device
Memory* srcMemory = dev().getGpuMemory(amdMemory);
// Copy buffer rect
result = copyBuffer(*srcMemory, dstMemory,
srcOrigin, origin, size, entire);
// Add pinned memory for a later release
gpu().addPinnedMem(amdMemory);
}
else {
// Check if runtime has to pin a big allocation and
// release all pinned memory
if (pinSize > dev().settings().pinnedXferSize_) {
gpu().releasePinnedMem();
}
result = DmaBlitManager::writeBuffer(
srcHost, dstMemory, origin, size, entire);
}
}
synchronize();
return result;
}
// Writes a rectangular region of host memory into a buffer.
//
// The host copy is used if the write is disabled in the setup, if the
// buffer has direct host access or a persistent direct map, or if the host
// memory can't be pinned. Otherwise the host range is pinned and
// transferred with copyBufferRect().
//
// hostRect  - rectangle description inside the host memory
// bufRect   - rectangle description inside the destination buffer
// size      - size of the copy region
// entire    - forwarded unchanged to the selected copy path
//
// Returns the result of the path that performed the copy.
bool
KernelBlitManager::writeBufferRect(
    const void* srcHost,
    device::Memory&     dstMemory,
    const amd::BufferRect&   hostRect,
    const amd::BufferRect&   bufRect,
    const amd::Coord3D& size,
    bool        entire) const
{
    amd::ScopedLock k(lockXferOps_);
    bool    result = false;

    // Use host copy if memory has direct access or it's persistent
    if (setup_.disableWriteBufferRect_ ||
        gpuMem(dstMemory).isHostMemDirectAccess() ||
        gpuMem(dstMemory).isPersistentDirectMap()) {
        result = HostBlitManager::writeBufferRect(
            srcHost, dstMemory, hostRect, bufRect, size, entire);
        synchronize();
        return result;
    }
    else {
        // Pin everything from the host pointer up to the rectangle's end
        size_t  pinSize = hostRect.start_ + hostRect.end_;
        size_t  partial;
        amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);

        if (amdMemory == NULL) {
            // Force SW copy
            result = HostBlitManager::writeBufferRect(
                srcHost, dstMemory, hostRect, bufRect, size, entire);
            synchronize();
            return result;
        }

        // Get device memory for this virtual device
        Memory* srcMemory = dev().getGpuMemory(amdMemory);

        // Readjust host mem offset: the rectangle starts 'partial' bytes
        // inside the pinned memory object
        amd::BufferRect rect;
        rect.rowPitch_   = hostRect.rowPitch_;
        rect.slicePitch_ = hostRect.slicePitch_;
        rect.start_      = hostRect.start_ + partial;
        rect.end_        = hostRect.end_;

        // Copy buffer rect
        result = copyBufferRect(*srcMemory, dstMemory,
            rect, bufRect, size, entire);

        // Add pinned memory for a later release
        gpu().addPinnedMem(amdMemory);
    }

    synchronize();

    return result;
}
// Fills a region of a buffer with a repeating pattern.
//
// The host fill is used if the fill is disabled in the setup or the buffer
// has direct host access. Otherwise the pattern is uploaded into the
// constant buffer and the FillBuffer kernel writes size[0] / patternSize
// copies of it.
//
// memory       - buffer to fill
// pattern      - host pointer to the pattern
// patternSize  - pattern size in bytes
// origin       - offset of the fill region in bytes
// size         - size of the fill region in bytes
// entire       - forwarded to the host fill only
//
// Returns false if the constant buffer is unavailable or can't be mapped,
// otherwise the result of the selected fill path.
bool
KernelBlitManager::fillBuffer(
    device::Memory& memory,
    const void* pattern,
    size_t      patternSize,
    const amd::Coord3D& origin,
    const amd::Coord3D& size,
    bool        entire
    ) const
{
    amd::ScopedLock k(lockXferOps_);
    bool    result = false;

    // Use host fill if memory has direct access
    if (setup_.disableFillBuffer_ ||
        gpuMem(memory).isHostMemDirectAccess()) {
        result = HostBlitManager::fillBuffer(
            memory, pattern, patternSize, origin, size, entire);
        synchronize();
        return result;
    }
    else {
        uint    fillType = FillBuffer;
        size_t  globalWorkOffset[3] = { 0, 0, 0 };
        // Number of pattern copies, one work-item per copy
        cl_int  fillSize = size[0] / patternSize;
        size_t  globalWorkSize = amd::alignUp(fillSize, 256);
        size_t  localWorkSize = 256;
        bool    dwordAligned =
            ((patternSize % sizeof(uint32_t)) == 0) ? true : false;

        // Program kernels arguments for the fill operation.
        // The buffer goes into argument 1 for dword aligned patterns and
        // into argument 0 otherwise; the other argument stays empty
        Memory* mem = &gpuMem(memory);
        if (dwordAligned) {
            setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL);
            setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
        }
        else {
            setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
            setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL);
        }

        Memory* gpuCB = dev().getGpuMemory(constantBuffer_);
        if (gpuCB == NULL) {
            return false;
        }

        // Upload the pattern into the constant buffer
        void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly);
        if (constBuf == NULL) {
            // Don't copy the pattern through an invalid pointer
            return false;
        }
        memcpy(constBuf, pattern, patternSize);
        gpuCB->unmap(&gpu());
        setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB);

        // The kernel argument is a 32 bit value, so pass a real cl_uint
        // instead of the leading bytes of a size_t object
        cl_uint patternUnits = static_cast<cl_uint>(patternSize);
        cl_int  offset = origin[0];
        // Dword aligned patterns are addressed in dwords
        if (dwordAligned) {
            patternUnits /= sizeof(uint32_t);
            offset /= sizeof(uint32_t);
        }
        setArgument(kernels_[fillType], 3, sizeof(patternUnits), &patternUnits);
        setArgument(kernels_[fillType], 4, sizeof(offset), &offset);
        setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize);

        // Create ND range object for the kernel's execution
        amd::NDRangeContainer ndrange(1,
            globalWorkOffset, &globalWorkSize, &localWorkSize);

        // Execute the blit
        address parameters = kernels_[fillType]->parameters().values();
        result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
    }

    synchronize();

    return result;
}
// Copies a linear range from one buffer into another.
//
// The copy is performed with a blit kernel unless the DMA path is forced
// (non-virtual heap and either buffer has no heap block) or either buffer
// is directly accessible host memory; in those cases the request is
// forwarded to DmaBlitManager::copyBuffer().
//
// srcMemory/dstMemory - source and destination buffers
// srcOrigin/dstOrigin - byte offsets (component 0) inside each buffer
// sizeIn              - copy size (component 0 is used by the kernel path)
// entire              - forwarded untouched to the DMA path
//
// Returns the status reported by the selected copy path.
bool
KernelBlitManager::copyBuffer(
    device::Memory&     srcMemory,
    device::Memory&     dstMemory,
    const amd::Coord3D& srcOrigin,
    const amd::Coord3D& dstOrigin,
    const amd::Coord3D& sizeIn,
    bool                entire) const
{
    amd::ScopedLock k(lockXferOps_);

    const bool forceCal = !dev().heap()->isVirtual() &&
        ((gpuMem(srcMemory).hb() == NULL) || (gpuMem(dstMemory).hb() == NULL));

    if (forceCal ||
        gpuMem(srcMemory).isHostMemDirectAccess() ||
        gpuMem(dstMemory).isHostMemDirectAccess()) {
        // Copy data with CAL (no VM mode only)
        const bool dmaResult = DmaBlitManager::copyBuffer(
            srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
        synchronize();
        return dmaResult;
    }

    // Candidate element widths, widest first. The trailing 1 matches any
    // request, so the search below always selects an entry.
    static const uint Alignments[] = { 16, 4, 1 };
    const size_t numAlignments = sizeof(Alignments) / sizeof(Alignments[0]);

    uint granularity = 1;
    for (size_t idx = 0; idx < numAlignments; ++idx) {
        const uint candidate = Alignments[idx];
        // Both origins and the size must be multiples of the candidate
        if (((srcOrigin[0] % candidate) == 0) &&
            ((dstOrigin[0] % candidate) == 0) &&
            ((sizeIn[0] % candidate) == 0)) {
            granularity = candidate;
            break;
        }
    }

    // Anything wider than a byte goes through the aligned kernel
    const uint blitType =
        (granularity != 1) ? BlitCopyBufferAligned : BlitCopyBuffer;

    // Number of elements of the selected width
    const size_t elementCount = sizeIn[0] / granularity;

    // Program the dispatch dimensions
    size_t  globalWorkOffset[3] = { 0, 0, 0 };
    size_t  localWorkSize = 256;
    size_t  globalWorkSize = amd::alignUp(elementCount, 256);

    // Program kernels arguments for the blit operation
    Memory* mem = &gpuMem(srcMemory);
    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
    mem = &gpuMem(dstMemory);
    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);

    // Program source origin
    cl_int  srcOffset = srcOrigin[0] / granularity;
    setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);

    // Program destination origin
    cl_int  dstOffset = dstOrigin[0] / granularity;
    setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);

    cl_int  copySize = elementCount;
    setArgument(kernels_[blitType], 4, sizeof(copySize), &copySize);

    // Only the aligned kernel takes the element width
    if (blitType == BlitCopyBufferAligned) {
        cl_int  alignment = granularity;
        setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment);
    }

    // Create ND range object for the kernel's execution
    amd::NDRangeContainer ndrange(1,
        globalWorkOffset, &globalWorkSize, &localWorkSize);

    // Execute the blit
    address parameters = kernels_[blitType]->parameters().values();
    const bool result =
        gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);

    synchronize();
    return result;
}
// Fills a region of an image with a pattern.
//
// The host path (HostBlitManager::fillImage) is used when image fills are
// disabled in the setup, when the memory is directly accessible by the host,
// or when a required image view can't be created. Otherwise the FillImage
// kernel is dispatched.
//
// memory  - image to fill
// pattern - fill value; the same pointer is passed to the kernel as
//           float4, int4 and uint4 arguments
// origin  - start of the fill region
// size    - dimensions of the fill region
// entire  - forwarded untouched to the host path
//
// Returns the status reported by the selected fill path.
bool
KernelBlitManager::fillImage(
    device::Memory&     memory,
    const void*         pattern,
    const amd::Coord3D& origin,
    const amd::Coord3D& size,
    bool                entire
    ) const
{
    amd::ScopedLock k(lockXferOps_);
    bool    result = false;

    // Use host fill if memory has direct access
    if (setup_.disableFillImage_ ||
        gpuMem(memory).isHostMemDirectAccess()) {
        result = HostBlitManager::fillImage(
            memory, pattern, origin, size, entire);
        synchronize();
        return result;
    }

    const uint  fillType = FillImage;
    const size_t dim = 3;
    size_t  globalWorkOffset[3] = { 0, 0, 0 };
    size_t  globalWorkSize[3];
    size_t  localWorkSize[3];
    Memory* memView = &gpuMem(memory);
    amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
    bool    swapLayer = (memView->cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) &&
        !dev().settings().siPlus_;

    bool    rejected = false;
    bool    releaseView = false;

    // For depth and sRGB surfaces, we need to create a view
    if ((memView->cal()->format_ == CM_SURF_FMT_DEPTH32F) ||
        (memView->cal()->format_ == CM_SURF_FMT_RGBA8_SRGB) ||
        (memView->cal()->format_ == CM_SURF_FMT_DEPTH16)) {
        // Find unsupported data type
        for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
            if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
                newFormat.image_channel_data_type = RejectedData[i].clNewType_;
                rejected = true;
                break;
            }
        }

        // Below may not be correct. We need to find why unsigned int view doesn't work for DEPTH16.
        if (gpuMem(memory).cal()->format_ == CM_SURF_FMT_DEPTH16) {
            newFormat.image_channel_data_type = CL_UNORM_INT16;
        }

        // Find unsupported channel order
        if (gpuMem(memory).cal()->format_ == CM_SURF_FMT_RGBA8_SRGB) {
            for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
                if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
                    newFormat.image_channel_order = RejectedOrder[i].clNewType_;
                    rejected = true;
                    break;
                }
            }
        }
    }

    // If the image format was rejected, then attempt to create a view
    if (rejected) {
        memView = createView(gpuMem(memory), dev().getCalFormat(newFormat));
        if (memView == NULL) {
            // The view couldn't be created, so there is no surface the kernel
            // can write to. Fall back to the host path instead of
            // dereferencing a NULL view below.
            result = HostBlitManager::fillImage(
                memory, pattern, origin, size, entire);
            synchronize();
            return result;
        }
        releaseView = true;
    }

    // Program the kernels workload depending on the fill dimensions
    if (memView->cal()->dimSize_ == 1) {
        globalWorkSize[0] = amd::alignUp(size[0], 256);
        globalWorkSize[1] = amd::alignUp(size[1], 1);
        globalWorkSize[2] = amd::alignUp(size[2], 1);
        localWorkSize[0] = 256;
        localWorkSize[1] = localWorkSize[2] = 1;
    }
    else if (memView->cal()->dimSize_ == 2) {
        globalWorkSize[0] = amd::alignUp(size[0], 16);
        globalWorkSize[1] = amd::alignUp(size[1], 16);
        globalWorkSize[2] = amd::alignUp(size[2], 1);
        localWorkSize[0] = localWorkSize[1] = 16;
        localWorkSize[2] = 1;
        // Swap the Y and Z components, apparently HW expects
        // layer in Z
        if (swapLayer) {
            globalWorkSize[2] = globalWorkSize[1];
            globalWorkSize[1] = 1;
            localWorkSize[2] = localWorkSize[1];
            localWorkSize[1] = 1;
        }
    }
    else {
        globalWorkSize[0] = amd::alignUp(size[0], 8);
        globalWorkSize[1] = amd::alignUp(size[1], 8);
        globalWorkSize[2] = amd::alignUp(size[2], 4);
        localWorkSize[0] = localWorkSize[1] = 8;
        localWorkSize[2] = 4;
    }

    // Program kernels arguments for the blit operation
    Memory* mem = memView;
    setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
    // The same pattern is exposed under all three element interpretations;
    // the "type" argument below tells the kernel which one applies
    setArgument(kernels_[fillType], 1, sizeof(cl_float4), pattern);
    setArgument(kernels_[fillType], 2, sizeof(cl_int4), pattern);
    setArgument(kernels_[fillType], 3, sizeof(cl_uint4), pattern);

    cl_int fillOrigin[4] = { (cl_int)origin[0],
                             (cl_int)origin[1],
                             (cl_int)origin[2], 0 };
    cl_int fillSize[4] = { (cl_int)size[0],
                           (cl_int)size[1],
                           (cl_int)size[2], 0 };
    // Keep the region consistent with the swapped dispatch dimensions
    if (swapLayer) {
        fillOrigin[2] = fillOrigin[1];
        fillOrigin[1] = 0;
        fillSize[2] = fillSize[1];
        fillSize[1] = 1;
    }
    setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
    setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);

    // Find the type of image:
    // 0 - normalized or floating point, 1 - signed integer, 2 - unsigned integer
    uint32_t type = 0;
    switch (newFormat.image_channel_data_type) {
        case CL_SNORM_INT8:
        case CL_SNORM_INT16:
        case CL_UNORM_INT8:
        case CL_UNORM_INT16:
        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
        case CL_UNORM_INT_101010:
        case CL_HALF_FLOAT:
        case CL_FLOAT:
            type = 0;
            break;
        case CL_SIGNED_INT8:
        case CL_SIGNED_INT16:
        case CL_SIGNED_INT32:
            type = 1;
            break;
        case CL_UNSIGNED_INT8:
        case CL_UNSIGNED_INT16:
        case CL_UNSIGNED_INT32:
            type = 2;
            break;
    }
    setArgument(kernels_[fillType], 6, sizeof(type), &type);

    // Create ND range object for the kernel's execution
    amd::NDRangeContainer ndrange(dim,
        globalWorkOffset, globalWorkSize, localWorkSize);

    // Execute the blit
    address parameters = kernels_[fillType]->parameters().values();
    result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);

    // Destroy the temporary view if one was created for this fill
    if (releaseView) {
        delete memView;
    }

    synchronize();
    return result;
}
// Dispatches the Scheduler kernel.
//
// vqueue   - memory object passed as kernel argument 0
// params   - memory object passed as kernel argument 1
// paramIdx - value passed as kernel argument 2
// numSlots - slot count; the global work size is numSlots / 32
//            with a local work size of 1
//
// Returns the status of the kernel submission.
bool
KernelBlitManager::runScheduler(
    device::Memory& vqueue,
    device::Memory& params,
    uint            paramIdx,
    uint            numSlots
    ) const
{
    amd::ScopedLock k(lockXferOps_);

    // Program kernels arguments
    Memory* queueMem = &gpuMem(vqueue);
    Memory* paramMem = &gpuMem(params);
    setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &queueMem);
    setArgument(kernels_[Scheduler], 1, sizeof(cl_mem), &paramMem);
    setArgument(kernels_[Scheduler], 2, sizeof(uint), &paramIdx);

    // One dimensional dispatch
    size_t  globalWorkOffset[1] = { 0 };
    size_t  globalWorkSize[1] = { numSlots / 32 };
    size_t  localWorkSize[1] = { 1 };

    // Create ND range object for the kernel's execution
    amd::NDRangeContainer ndrange(1,
        globalWorkOffset, globalWorkSize, localWorkSize);

    // Execute the kernel
    address parameters = kernels_[Scheduler]->parameters().values();
    const bool result =
        gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], parameters);

    synchronize();
    return result;
}
// Wraps a host memory range into a CL_MEM_USE_HOST_PTR buffer and obtains
// the device memory for it on this virtual GPU.
//
// hostMem - start of the host range to pin
// pinSize - size of the range in bytes
// partial - [out] distance in bytes between hostMem and the aligned-down
//           address the buffer actually starts at
//
// Returns the new amd::Memory object, or NULL if the buffer couldn't be
// allocated or created, or if device memory couldn't be obtained.
amd::Memory*
KernelBlitManager::pinHostMemory(
    const void* hostMem,
    size_t      pinSize,
    size_t&     partial) const
{
    size_t  pinAllocSize;
    const static bool SysMem = true;
    amd::Memory* amdMemory;

    // Align offset to 4K boundary (Vista/Win7 limitation)
    char* tmpHost = const_cast<char*>(
        amd::alignDown(reinterpret_cast<const char*>(hostMem),
        PinnedMemoryAlignment));

    // Find the partial size for unaligned copy
    partial = reinterpret_cast<const char*>(hostMem) - tmpHost;

    // Recalculate pin memory size
    pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);

    amdMemory = new(*context_)
        amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize);

    // Bail out on a failed allocation; the object is dereferenced below
    if (amdMemory == NULL) {
        return NULL;
    }

    if (!amdMemory->create(tmpHost, SysMem)) {
        amdMemory->release();
        return NULL;
    }

    // Get device memory for this virtual device
    // @note: This will force real memory pinning
    amdMemory->setVirtualDevice(&gpu());
    Memory* srcMemory = dev().getGpuMemory(amdMemory);

    if (srcMemory == NULL) {
        // Release all pinned memory and attempt pinning again
        gpu().releasePinnedMem();
        srcMemory = dev().getGpuMemory(amdMemory);
        if (srcMemory == NULL) {
            // Release memory
            amdMemory->release();
            amdMemory = NULL;
        }
    }

    return amdMemory;
}
// Creates an image view of the parent image with a different format.
//
// parent - image to create the view of; buffers aren't supported
// format - format type and channel order for the view
//
// Returns the new view object, which the caller must delete, or NULL if
// the object couldn't be allocated or the view resource couldn't be created.
Memory*
KernelBlitManager::createView(
    const Memory&       parent,
    const CalFormat&    format
    ) const
{
    assert(!parent.cal()->buffer_ && "View supports images only");

    // The view keeps the parent's size, dimensions and image type;
    // only the format type and channel order are replaced
    gpu::Memory* view = new gpu::Image(dev(), parent.size(),
        parent.cal()->width_,
        parent.cal()->height_,
        parent.cal()->depth_,
        format.type_,
        format.channelOrder_,
        parent.cal()->imageType_);

    if (NULL == view) {
        return NULL;
    }

    // Describe the view: level 0, layer 0 of the parent
    Resource::ImageViewParams viewParams;
    viewParams.owner_       = parent.owner();
    viewParams.level_       = 0;
    viewParams.layer_       = 0;
    viewParams.resource_    = &parent;
    viewParams.memory_      = &parent;
    viewParams.gpu_         = &gpu();

    // Create memory object
    if (!view->create(Resource::ImageView, &viewParams)) {
        delete view;
        return NULL;
    }

    return view;
}
} // namespace gpu