SWDEV-434298 - Change copy buffer kernel
The new copy kernel can limit the number of launched workgoups. It can copy in chunks of 16 bytes or 4 bytes. Workgoup size is increased to 512 or 1024 Change-Id: Ic3fefa2d5bda6afebd1acc4d41ad310b138af6df
Cette révision appartient à :
@@ -116,9 +116,34 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
|
||||
}
|
||||
|
||||
__kernel void __amd_rocclr_copyBuffer(__global uchar* srcI, __global uchar* dstI,
|
||||
ulong srcOrigin, ulong dstOrigin, ulong size,
|
||||
uint remain) {
|
||||
__amd_copyBuffer(srcI, dstI, srcOrigin, dstOrigin, size, remain);
|
||||
ulong srcOrigin, ulong dstOrigin, ulong size, uint remainder,
|
||||
uint aligned_size, ulong end_ptr, uint next_chunk) {
|
||||
ulong id = get_global_id(0);
|
||||
ulong id_remainder = id;
|
||||
|
||||
__global uchar* src = srcI + srcOrigin;
|
||||
__global uchar* dst = dstI + dstOrigin;
|
||||
|
||||
if (aligned_size == sizeof(ulong2)) {
|
||||
__global ulong2* srcD = (__global ulong2*)(src);
|
||||
__global ulong2* dstD = (__global ulong2*)(dst);
|
||||
while ((ulong)(&dstD[id]) < end_ptr) {
|
||||
dstD[id] = srcD[id];
|
||||
id += next_chunk;
|
||||
}
|
||||
} else {
|
||||
__global uint* srcD = (__global uint*)(src);
|
||||
__global uint* dstD = (__global uint*)(dst);
|
||||
while ((ulong)(&dstD[id]) < end_ptr) {
|
||||
dstD[id] = srcD[id];
|
||||
id += next_chunk;
|
||||
}
|
||||
}
|
||||
if ((remainder != 0) && (id_remainder == 0)) {
|
||||
for (ulong i = size - remainder; i < size; ++i) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void __amd_rocclr_copyBufferAligned(__global uint* src, __global uint* dst,
|
||||
|
||||
@@ -911,7 +911,7 @@ bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Mem
|
||||
// Step 2. Initiate compute transfer with all staging buffers
|
||||
for (uint i = 0; i < MaxXferBuffers; ++i) {
|
||||
if (copySize > 0) {
|
||||
if (!copyBufferToImageKernel(*xferBuf[i], dstMemory, xferSrc, dst, xferRect, false,
|
||||
if (!copyBufferToImageKernel(*xferBuf[i], dstMemory, xferSrc, dst, xferRect, false,
|
||||
0UL, 0UL, copyMetadata)) {
|
||||
transfer = false;
|
||||
break;
|
||||
@@ -2276,74 +2276,57 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
bool result = false;
|
||||
|
||||
if (!gpuMem(srcMemory).isHostMemDirectAccess() && !gpuMem(dstMemory).isHostMemDirectAccess()) {
|
||||
uint blitType = BlitCopyBuffer;
|
||||
size_t dim = 1;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize = 0;
|
||||
size_t localWorkSize = 0;
|
||||
constexpr uint32_t kBlitType = BlitCopyBuffer;
|
||||
constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
|
||||
amd::Coord3D size(sizeIn[0]);
|
||||
|
||||
const static uint CopyBuffAlignment[3] = {16, 4, 1};
|
||||
amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
|
||||
// Check alignments for source and destination
|
||||
bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
|
||||
uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);
|
||||
|
||||
uint i;
|
||||
for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) {
|
||||
// Check source alignments
|
||||
bool aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0);
|
||||
// Check destination alignments
|
||||
aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0);
|
||||
// Check copy size alignment in the first dimension
|
||||
aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0);
|
||||
|
||||
if (aligned) {
|
||||
if (CopyBuffAlignment[i] != 1) {
|
||||
blitType = BlitCopyBufferAligned;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t remain;
|
||||
if (blitType == BlitCopyBufferAligned) {
|
||||
size.c[0] /= CopyBuffAlignment[i];
|
||||
} else {
|
||||
remain = size[0] % 4;
|
||||
size.c[0] /= 4;
|
||||
size.c[0] += 1;
|
||||
}
|
||||
// Setup copy size accordingly to the alignment
|
||||
uint32_t remainder = size[0] % aligned_size;
|
||||
size.c[0] /= aligned_size;
|
||||
size.c[0] += (remainder != 0) ? 1 : 0;
|
||||
|
||||
// Program the dispatch dimensions
|
||||
localWorkSize = 256;
|
||||
globalWorkSize = amd::alignUp(size[0], 256);
|
||||
const size_t localWorkSize = (aligned) ? 512 : 1024;
|
||||
size_t globalWorkSize = std::min(dev().settings().limit_blit_wg_ * localWorkSize, size[0]);
|
||||
globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
|
||||
|
||||
// Program kernels arguments for the blit operation
|
||||
Memory* mem = &gpuMem(srcMemory);
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem);
|
||||
mem = &gpuMem(dstMemory);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem);
|
||||
|
||||
// Program source origin
|
||||
uint64_t srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
|
||||
setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
|
||||
uint64_t srcOffset = srcOrigin[0];
|
||||
setArgument(kernels_[kBlitType], 2, sizeof(srcOffset), &srcOffset);
|
||||
|
||||
// Program destinaiton origin
|
||||
uint64_t dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
|
||||
setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
|
||||
uint64_t dstOffset = dstOrigin[0];
|
||||
setArgument(kernels_[kBlitType], 3, sizeof(dstOffset), &dstOffset);
|
||||
|
||||
uint64_t copySize = size[0];
|
||||
setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size);
|
||||
uint64_t copySize = sizeIn[0];
|
||||
setArgument(kernels_[kBlitType], 4, sizeof(copySize), ©Size);
|
||||
|
||||
if (blitType == BlitCopyBufferAligned) {
|
||||
int32_t alignment = CopyBuffAlignment[i];
|
||||
setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment);
|
||||
} else {
|
||||
setArgument(kernels_[blitType], 5, sizeof(remain), &remain);
|
||||
}
|
||||
setArgument(kernels_[kBlitType], 5, sizeof(remainder), &remainder);
|
||||
setArgument(kernels_[kBlitType], 6, sizeof(aligned_size), &aligned_size);
|
||||
|
||||
// End pointer is the aligned copy size
|
||||
uint64_t end_ptr = dstMemory.virtualAddress() + sizeIn[0] - remainder;
|
||||
setArgument(kernels_[kBlitType], 7, sizeof(end_ptr), &end_ptr);
|
||||
|
||||
uint32_t next_chunk = globalWorkSize;
|
||||
setArgument(kernels_[kBlitType], 8, sizeof(next_chunk), &next_chunk);
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
|
||||
amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = kernels_[blitType]->parameters().values();
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
||||
address parameters = kernels_[kBlitType]->parameters().values();
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters);
|
||||
} else {
|
||||
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire, copyMetadata);
|
||||
}
|
||||
@@ -2359,10 +2342,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
constexpr size_t kFillImageThreshold = 256 * 256;
|
||||
|
||||
|
||||
// Use host fill if memory has direct access and image is small
|
||||
if (setup_.disableFillImage_ ||
|
||||
(gpuMem(memory).isHostMemDirectAccess() &&
|
||||
(gpuMem(memory).isHostMemDirectAccess() &&
|
||||
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
|
||||
@@ -2236,7 +2236,6 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern
|
||||
setArgument(kernels_[fillType], 8, sizeof(height), &height);
|
||||
setArgument(kernels_[fillType], 9, sizeof(pitch), &pitch);
|
||||
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(2, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
|
||||
@@ -2298,76 +2297,57 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
uint blitType = BlitCopyBuffer;
|
||||
size_t dim = 1;
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
size_t globalWorkSize = 0;
|
||||
size_t localWorkSize = 0;
|
||||
constexpr uint32_t kBlitType = BlitCopyBuffer;
|
||||
constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
|
||||
amd::Coord3D size(sizeIn[0]);
|
||||
|
||||
// todo LC shows much better performance with the unaligned version
|
||||
const static uint CopyBuffAlignment[3] = {1 /*16*/, 1 /*4*/, 1};
|
||||
amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
|
||||
// Check alignments for source and destination
|
||||
bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
|
||||
uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);
|
||||
|
||||
uint i;
|
||||
for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) {
|
||||
bool aligned = false;
|
||||
// Check source alignments
|
||||
aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0);
|
||||
// Check destination alignments
|
||||
aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0);
|
||||
// Check copy size alignment in the first dimension
|
||||
aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0);
|
||||
|
||||
if (aligned) {
|
||||
if (CopyBuffAlignment[i] != 1) {
|
||||
blitType = BlitCopyBufferAligned;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t remain;
|
||||
if (blitType == BlitCopyBufferAligned) {
|
||||
size.c[0] /= CopyBuffAlignment[i];
|
||||
} else {
|
||||
remain = size[0] % 4;
|
||||
size.c[0] /= 4;
|
||||
size.c[0] += 1;
|
||||
}
|
||||
// Setup copy size accordingly to the alignment
|
||||
uint32_t remainder = size[0] % aligned_size;
|
||||
size.c[0] /= aligned_size;
|
||||
size.c[0] += (remainder != 0) ? 1 : 0;
|
||||
|
||||
// Program the dispatch dimensions
|
||||
localWorkSize = 256;
|
||||
globalWorkSize = amd::alignUp(size[0], 256);
|
||||
const size_t localWorkSize = (aligned) ? 512 : 1024;
|
||||
size_t globalWorkSize = std::min(dev().settings().limit_blit_wg_ * localWorkSize, size[0]);
|
||||
globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
|
||||
|
||||
// Program kernels arguments for the blit operation
|
||||
cl_mem mem = as_cl<amd::Memory>(srcMemory.owner());
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, 0, &srcMemory);
|
||||
setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem, 0, &srcMemory);
|
||||
mem = as_cl<amd::Memory>(dstMemory.owner());
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, 0, &dstMemory);
|
||||
setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem, 0, &dstMemory);
|
||||
|
||||
// Program source origin
|
||||
uint64_t srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
|
||||
setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
|
||||
uint64_t srcOffset = srcOrigin[0];
|
||||
setArgument(kernels_[kBlitType], 2, sizeof(srcOffset), &srcOffset);
|
||||
|
||||
// Program destinaiton origin
|
||||
uint64_t dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
|
||||
setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
|
||||
uint64_t dstOffset = dstOrigin[0];
|
||||
setArgument(kernels_[kBlitType], 3, sizeof(dstOffset), &dstOffset);
|
||||
|
||||
uint64_t copySize = size[0];
|
||||
setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size);
|
||||
uint64_t copySize = sizeIn[0];
|
||||
setArgument(kernels_[kBlitType], 4, sizeof(copySize), ©Size);
|
||||
|
||||
if (blitType == BlitCopyBufferAligned) {
|
||||
int32_t alignment = CopyBuffAlignment[i];
|
||||
setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment);
|
||||
} else {
|
||||
setArgument(kernels_[blitType], 5, sizeof(remain), &remain);
|
||||
}
|
||||
setArgument(kernels_[kBlitType], 5, sizeof(remainder), &remainder);
|
||||
setArgument(kernels_[kBlitType], 6, sizeof(aligned_size), &aligned_size);
|
||||
|
||||
// End pointer is the aligned copy size
|
||||
uint64_t end_ptr = dstMemory.virtualAddress() + sizeIn[0] - remainder;
|
||||
setArgument(kernels_[kBlitType], 7, sizeof(end_ptr), &end_ptr);
|
||||
|
||||
uint32_t next_chunk = globalWorkSize;
|
||||
setArgument(kernels_[kBlitType], 8, sizeof(next_chunk), &next_chunk);
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
|
||||
amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = captureArguments(kernels_[blitType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
|
||||
address parameters = captureArguments(kernels_[kBlitType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
}
|
||||
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur