SWDEV-305527 - Changes to handle memset blit kernel that takes width, height and depth. This also fixes SWDEV-317261.

Change-Id: Ic85f63a95d9d8f48884fc8c7fd95cbb496dfbbca


[ROCm/clr commit: 7fb80a027a]
Этот коммит содержится в:
kjayapra-amd
2021-10-19 13:00:51 -04:00
коммит произвёл Karthik Jayaprakash
родитель 8fb4e3478a
Коммит 31c0525344
11 изменённых файлов: 224 добавлений и 47 удалений
+2 -2
Просмотреть файл
@@ -553,8 +553,8 @@ bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMe
}
bool HostBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize,
const amd::Coord3D& origin, const amd::Coord3D& size, bool entire,
bool forceBlit) const {
const amd::Coord3D& surface, const amd::Coord3D& origin,
const amd::Coord3D& size, bool entire, bool forceBlit) const {
// Map memory
void* fillMem = memory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0);
if (fillMem == NULL) {
+2
Просмотреть файл
@@ -178,6 +178,7 @@ class BlitManager : public amd::HeapObject {
virtual bool fillBuffer(Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& surface, //!< Whole Surface of mem object.
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the fill region
bool entire = false, //!< Entire buffer will be updated
@@ -331,6 +332,7 @@ class HostBlitManager : public device::BlitManager {
virtual bool fillBuffer(device::Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& surface, //!< Whole Surface of mem object.
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the fill region
bool entire = false, //!< Entire buffer will be updated
+19
Просмотреть файл
@@ -27,6 +27,10 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
extern void __amd_fillBufferAligned(__global uchar*, __global ushort*, __global uint*,
__global ulong*, __constant uchar*, uint, ulong, ulong);
extern void __amd_fillBufferAligned2D(__global uchar*, __global ushort*, __global uint*,
__global ulong*, __constant uchar*, uint, ulong, ulong,
ulong, ulong);
extern void __amd_copyBuffer(__global uchar*, __global uchar*, ulong, ulong, ulong, uint);
extern void __amd_copyBufferAligned(__global uint*, __global uint*, ulong, ulong, ulong, uint);
@@ -47,6 +51,21 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
pattern, patternSize, offset, size);
}
__kernel void __amd_rocclr_fillBufferAligned2D(__global uchar* bufUChar,
__global ushort* bufUShort,
__global uint* bufUInt,
__global ulong* bufULong,
__constant uchar* pattern,
uint patternSize,
ulong offset,
ulong width,
ulong height,
ulong pitch) {
__amd_fillBufferAligned2D(bufUChar, bufUShort, bufUInt, bufULong,
pattern, patternSize, offset, width, height,
pitch);
}
__kernel void __amd_rocclr_copyBuffer(__global uchar* srcI, __global uchar* dstI,
ulong srcOrigin, ulong dstOrigin, ulong size,
uint remain) {
+4 -3
Просмотреть файл
@@ -2101,8 +2101,8 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
}
bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, bool forceBlit) const {
const amd::Coord3D& surface, const amd::Coord3D& origin,
const amd::Coord3D& size, bool entire, bool forceBlit) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
@@ -2110,7 +2110,8 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
if (setup_.disableFillBuffer_ || (!forceBlit && gpuMem(memory).isHostMemDirectAccess())) {
gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire);
result = HostBlitManager::fillBuffer(memory, pattern, patternSize, size, origin, size,
entire);
synchronize();
return result;
} else {
+1
Просмотреть файл
@@ -344,6 +344,7 @@ class KernelBlitManager : public DmaBlitManager {
virtual bool fillBuffer(device::Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& surface, //!< Whole Surface of mem object.
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the fill region
bool entire = false, //!< Entire buffer will be updated
+2 -2
Просмотреть файл
@@ -117,8 +117,8 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
if ((cpuAccess_ == nullptr) && zero && !prog.isInternal()) {
uint64_t pattern = 0;
size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
prog.palDevice().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0),
amd::Coord3D(size));
prog.palDevice().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(size),
amd::Coord3D(0), amd::Coord3D(size));
}
switch (segment) {
+3 -2
Просмотреть файл
@@ -810,7 +810,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
uint64_t pattern = 0;
amd::Coord3D origin(0, 0, 0);
amd::Coord3D region(virtualQueue_->size(), 0, 0);
if (!dev().xferMgr().fillBuffer(*virtualQueue_, &pattern, sizeof(pattern), origin, region)) {
if (!dev().xferMgr().fillBuffer(*virtualQueue_, &pattern, sizeof(pattern), region, origin,
region)) {
return false;
}
@@ -1826,7 +1827,7 @@ bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const
pattern = fillValue;
patternSize = elemSize;
}
result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize,
result = blitMgr().fillBuffer(*memory, pattern, patternSize, realSize, realOrigin, realSize,
amdMemory->isEntirelyCovered(origin, size), forceBlit);
if (nullptr != bufferFromImage) {
bufferFromImage->release();
+136 -3
Просмотреть файл
@@ -1941,8 +1941,35 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
// ================================================================================================
bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, bool forceBlit) const {
const amd::Coord3D& surface, const amd::Coord3D& origin,
const amd::Coord3D& size, bool entire, bool forceBlit) const {
guarantee(size[0] > 0 && size[1] > 0 && size[2] > 0, "Dimension cannot be 0");
if (size[1] == 1 && size[2] == 1) {
return fillBuffer1D(memory, pattern, patternSize, surface, origin, size, entire, forceBlit);
} else if (size[2] == 1) {
return fillBuffer2D(memory, pattern, patternSize, surface, origin, size, entire, forceBlit);
} else {
bool ret_val = true;
amd::Coord3D my_origin(origin);
amd::Coord3D my_region{surface[1], surface[2], size[2]};
amd::BufferRect rect;
rect.create(static_cast<size_t*>(my_origin), static_cast<size_t*>(my_region), surface[0], 0);
for (size_t slice = 0; slice < size[2]; ++slice) {
const size_t row_offset = rect.offset(0, 0, slice);
amd::Coord3D new_origin(row_offset, origin[1], origin[2]);
ret_val |= fillBuffer2D(memory, pattern, patternSize, surface, new_origin, size, entire,
forceBlit);
}
return ret_val;
}
}
// ================================================================================================
bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern, size_t patternSize,
const amd::Coord3D& surface, const amd::Coord3D& origin,
const amd::Coord3D& size, bool entire, bool forceBlit) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
@@ -1950,7 +1977,7 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
if (setup_.disableFillBuffer_ || (!forceBlit && memory.isHostMemDirectAccess())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire);
result = HostBlitManager::fillBuffer(memory, pattern, patternSize, size, origin, size, entire);
synchronize();
return result;
} else {
@@ -2044,6 +2071,112 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
return result;
}
// ================================================================================================
bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern,
size_t patternSize, const amd::Coord3D& surface,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, bool forceBlit) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
// Use host fill if memory has direct access
if (setup_.disableFillBuffer_ || (!forceBlit && memory.isHostMemDirectAccess())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillBuffer(memory, pattern, patternSize, size, origin, size, entire);
synchronize();
return result;
} else {
uint fillType = FillBufferAligned2D;
uint64_t fillSizeX = size[0]/patternSize;
uint64_t fillSizeY = size[1]/patternSize;
size_t globalWorkOffset[3] = {0, 0, 0};
size_t globalWorkSize[3] = {amd::alignUp(fillSizeX, 16),
amd::alignUp(fillSizeY, 16), 1};
size_t localWorkSize [3] = {16, 16, 1};
uint32_t alignment = (patternSize & 0x7) == 0 ?
sizeof(uint64_t) :
(patternSize & 0x3) == 0 ?
sizeof(uint32_t) :
(patternSize & 0x1) == 0 ?
sizeof(uint16_t) : sizeof(uint8_t);
cl_mem mem = as_cl<amd::Memory>(memory.owner());
if (alignment == sizeof(uint64_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
} else if (alignment == sizeof(uint32_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
} else if (alignment == sizeof(uint16_t)) {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
} else {
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
}
Memory* gpuCB = dev().getRocMemory(constantBuffer_);
if (gpuCB == nullptr) {
return false;
}
// Find offset in the current constant buffer to allow multipel fills
uint32_t constBufOffset = ConstantBufferOffset();
auto constBuf = reinterpret_cast<address>(constantBuffer_->getHostMem()) + constBufOffset;
memcpy(constBuf, pattern, patternSize);
mem = as_cl<amd::Memory>(gpuCB->owner());
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
uint64_t mem_origin = static_cast<uint64_t>(origin[0]);
uint64_t width = static_cast<uint64_t>(size[0]);
uint64_t height = static_cast<uint64_t>(size[1]);
uint64_t pitch = static_cast<uint64_t>(surface[0]);
patternSize/= alignment;
mem_origin /= alignment;
setArgument(kernels_[fillType], 5, sizeof(uint32_t), &patternSize);
setArgument(kernels_[fillType], 6, sizeof(mem_origin), &mem_origin);
setArgument(kernels_[fillType], 7, sizeof(width), &width);
setArgument(kernels_[fillType], 8, sizeof(height), &height);
setArgument(kernels_[fillType], 9, sizeof(pitch), &pitch);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(2, globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit
address parameters = captureArguments(kernels_[fillType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, nullptr);
releaseArguments(parameters);
}
synchronize();
return result;
}
// ================================================================================================
bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern,
size_t patternSize, const amd::Coord3D& surface,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire, bool forceBlit) const {
ShouldNotReachHere();
return false;
}
// ================================================================================================
bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
+42 -5
Просмотреть файл
@@ -231,6 +231,7 @@ class KernelBlitManager : public DmaBlitManager {
public:
enum {
FillBufferAligned = 0,
FillBufferAligned2D,
BlitCopyBuffer,
BlitCopyBufferAligned,
BlitCopyBufferRect,
@@ -366,12 +367,47 @@ class KernelBlitManager : public DmaBlitManager {
virtual bool fillBuffer(device::Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& surface, //!< Whole Surface of mem object.
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the fill region
bool entire = false, //!< Entire buffer will be updated
bool forceBlit = false //!< Force GPU Blit for fill
) const;
//! Fills a buffer memory with a pattern data
virtual bool fillBuffer1D(device::Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& surface, //!< Whole Surface of mem object.
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the fill region
bool entire = false, //!< Entire buffer will be updated
bool forceBlit = false //!< Force GPU Blit for fill
) const;
//! Fills a buffer memory with a pattern data
virtual bool fillBuffer2D(device::Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& surface, //!< Whole Surface of mem object.
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the fill region
bool entire = false, //!< Entire buffer will be updated
bool forceBlit = false //!< Force GPU Blit for fill
) const;
//! Fills a buffer memory with a pattern data
virtual bool fillBuffer3D(device::Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& surface, //!< Whole Surface of mem object.
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the fill region
bool entire = false, //!< Entire buffer will be updated
bool forceBlit = false //!< Force GPU Blit for fill
) const;
//! Fills an image memory with a pattern data
virtual bool fillImage(device::Memory& dstMemory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
@@ -483,11 +519,12 @@ class KernelBlitManager : public DmaBlitManager {
};
static const char* BlitName[KernelBlitManager::BlitTotal] = {
"__amd_rocclr_fillBufferAligned", "__amd_rocclr_copyBuffer", "__amd_rocclr_copyBufferAligned",
"__amd_rocclr_copyBufferRect", "__amd_rocclr_copyBufferRectAligned",
"__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait", "__amd_rocclr_scheduler",
"__amd_rocclr_gwsInit", "__amd_rocclr_fillImage", "__amd_rocclr_copyImage",
"__amd_rocclr_copyImage1DA", "__amd_rocclr_copyImageToBuffer", "__amd_rocclr_copyBufferToImage",
"__amd_rocclr_fillBufferAligned", "__amd_rocclr_fillBufferAligned2D", "__amd_rocclr_copyBuffer",
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_copyBufferRect",
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait",
"__amd_rocclr_scheduler", "__amd_rocclr_gwsInit", "__amd_rocclr_fillImage",
"__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA", "__amd_rocclr_copyImageToBuffer",
"__amd_rocclr_copyBufferToImage"
};
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
+12 -30
Просмотреть файл
@@ -2211,8 +2211,9 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
}
bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
size_t patternSize, const amd::Coord3D& origin,
const amd::Coord3D& size, bool forceBlit) {
size_t patternSize, const amd::Coord3D& surface,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool forceBlit) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
@@ -2238,8 +2239,9 @@ bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const
switch (type) {
case CL_COMMAND_SVM_MEMFILL:
case CL_COMMAND_FILL_BUFFER: {
amd::Coord3D realOrigin(origin[0]);
amd::Coord3D realSize(size[0]);
amd::Coord3D realSurf(surface[0], surface[1], surface[2]);
amd::Coord3D realOrigin(origin[0], origin[1], origin[2]);
amd::Coord3D realSize(size[0], size[1], size[2]);
// Reprogram fill parameters if it's an IMAGE1D_BUFFER object
if (imageBuffer) {
size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize();
@@ -2250,7 +2252,8 @@ bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const
pattern = fillValue;
patternSize = elemSize;
}
result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, entire, forceBlit);
result = blitMgr().fillBuffer(*memory, pattern, patternSize, realSurf, realOrigin,
realSize, entire, forceBlit);
break;
}
case CL_COMMAND_FILL_IMAGE: {
@@ -2275,30 +2278,9 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
amd::ScopedLock lock(execution());
profilingBegin(cmd);
if (cmd.type() == CL_COMMAND_FILL_IMAGE) {
if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
cmd.origin(), cmd.size())) {
cmd.setStatus(CL_INVALID_OPERATION);
}
} else {
size_t width = cmd.size().c[0];
size_t height = cmd.size().c[1];
size_t depth = cmd.size().c[2];
size_t pitch = cmd.surface().c[0];
amd::Coord3D origin = cmd.origin();
amd::Coord3D region{cmd.surface().c[1], cmd.surface().c[2], depth};
amd::BufferRect rect;
rect.create(static_cast<size_t*>(origin), static_cast<size_t*>(region),
pitch, 0);
for (size_t slice = 0; slice < depth; slice++) {
for (size_t row = 0; row < height; row++) {
const size_t rowOffset = rect.offset(0, row, slice);
if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
amd::Coord3D{rowOffset, 0, 0}, amd::Coord3D{width, 1, 1})) {
cmd.setStatus(CL_INVALID_OPERATION);
}
}
}
if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
cmd.surface(), cmd.origin(), cmd.size())) {
cmd.setStatus(CL_INVALID_OPERATION);
}
profilingEnd(cmd);
}
@@ -2636,7 +2618,7 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
amd::Coord3D origin(0, 0, 0);
amd::Coord3D region(virtualQueue_->getSize(), 0, 0);
if (!dev().xferMgr().fillBuffer(*vqMem, &pattern, sizeof(pattern), origin, region)) {
if (!dev().xferMgr().fillBuffer(*vqMem, &pattern, sizeof(pattern), region, origin, region)) {
return false;
}
+1
Просмотреть файл
@@ -436,6 +436,7 @@ class VirtualGPU : public device::VirtualDevice {
amd::Memory* amdMemory, //!< memory object to fill
const void* pattern, //!< pattern to fill the memory
size_t patternSize, //!< pattern size
const amd::Coord3D& surface, //!< Whole Surface of mem object.
const amd::Coord3D& origin, //!< memory origin
const amd::Coord3D& size, //!< memory size for filling
bool forceBlit = false //!< force shader blit path