SWDEV-292525 - [vdi] Path to streamOps shaders

Implementation to use a blit kernel to perform
a hipStreamWait/write instead of an AQL packet.

Change-Id: I462671ed5cec37144dfe97ff66439249196117c1
此提交包含在:
Sourabh
2021-08-31 16:13:17 +00:00
提交者 Sourabh Betigeri
父節點 60295a9da9
當前提交 cbb8d82bdb
共有 6 個檔案被更改,包括 169 行新增45 行删除
+13 -1
查看文件
@@ -34,6 +34,10 @@ const char* BlitSourceCode = BLIT_KERNELS(
extern void __amd_fillBufferAligned(__global uchar*, __global ushort*, __global uint*, __global ulong*,
__constant uchar*, uint, ulong, ulong);
extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong);
extern void __amd_streamOpsWait(__global uint*,__global ulong*, ulong, ulong, ulong);
__kernel void __amd_rocclr_copyBufferRect(__global uchar* src, __global uchar* dst, ulong4 srcRect,
ulong4 dstRect, ulong4 size) {
__amd_copyBufferRect(src, dst, srcRect, dstRect, size);
@@ -109,6 +113,14 @@ const char* BlitSourceCode = BLIT_KERNELS(
int4 patternINT4, uint4 patternUINT4, int4 origin, int4 size,
uint type) {
__amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4, origin, size, type);
});
}
__kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong, ulong value, ulong flags,
ulong mask) {
__amd_streamOpsWait(ptrInt, ptrUlong, value, flags, mask);
}
__kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong, ulong value, ulong sizeBytes) {
__amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
});
} // namespace device
+77
查看文件
@@ -2278,6 +2278,83 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
return result;
}
// ================================================================================================
bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
size_t sizeBytes) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
uint blitType = StreamOpsWrite;
size_t dim = 1;
size_t globalWorkOffset[1] = { 0 };
size_t globalWorkSize[1] = { 1 };
size_t localWorkSize[1] = { 1 };
// Program kernels arguments for the write operation
cl_mem mem = as_cl<amd::Memory>(memory.owner());
bool is32BitWrite = (sizeBytes == sizeof(uint32_t)) ? true : false;
// Program kernels arguments for the write operation
if (is32BitWrite) {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
} else {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
}
setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit
address parameters = captureArguments(kernels_[blitType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
releaseArguments(parameters);
synchronize();
return result;
}
// ================================================================================================
bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, size_t sizeBytes,
uint64_t flags, uint64_t mask) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
uint blitType = StreamOpsWait;
size_t dim = 1;
size_t globalWorkOffset[1] = { 0 };
size_t globalWorkSize[1] = { 1 };
size_t localWorkSize[1] = { 1 };
// Program kernels arguments for the wait operation
cl_mem mem = as_cl<amd::Memory>(memory.owner());
bool is32BitWait = (sizeBytes == sizeof(uint32_t)) ? true : false;
// Program kernels arguments for the wait operation
if (is32BitWait) {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags);
setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask);
} else {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);
}
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit
address parameters = captureArguments(kernels_[blitType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
releaseArguments(parameters);
synchronize();
return result;
}
// ================================================================================================
amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize,
size_t& partial) const {
size_t pinAllocSize;
+17 -1
查看文件
@@ -242,6 +242,8 @@ class KernelBlitManager : public DmaBlitManager {
FillImage,
Scheduler,
GwsInit,
StreamOpsWrite,
StreamOpsWait,
BlitTotal
};
@@ -387,6 +389,20 @@ class KernelBlitManager : public DmaBlitManager {
bool RunGwsInit(uint32_t value //!< Initial value for GWS resource
) const;
//! Stream memory write operation - Write a 'value' at 'memory'.
bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
uint64_t value,
size_t sizeBytes
) const;
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
uint64_t value,
size_t sizeBytes,
uint64_t flags,
uint64_t mask
) const;
virtual amd::Monitor* lockXfer() const { return &lockXferOps_; }
private:
@@ -466,7 +482,7 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer",
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned",
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler",
"__amd_rocclr_gwsInit"
"__amd_rocclr_gwsInit", "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait"
};
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
+55 -39
查看文件
@@ -2257,7 +2257,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
profilingBegin(cmd);
const cl_command_type type = cmd.type();
const int64_t value = cmd.value();
const uint64_t value = cmd.value();
const uint64_t mask = cmd.mask();
const unsigned int flags = cmd.flags();
const size_t sizeBytes = cmd.sizeBytes();
@@ -2267,44 +2267,58 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
Memory* memory = dev().getRocMemory(amdMemory);
if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
hsa_amd_barrier_value_packet_t aqlPacket;
hsa_amd_vendor_packet_header_t header;
hsa_signal_t signal;
Buffer* buff = static_cast<Buffer*>(memory);
if (GPU_STREAMOPS_CP_WAIT) {
hsa_amd_barrier_value_packet_t aqlPacket;
hsa_amd_vendor_packet_header_t header;
hsa_signal_t signal;
Buffer* buff = static_cast<Buffer*>(memory);
header.header = kBarrierVendorPacketHeader;
header.AmdFormat = HSA_AMD_PACKET_TYPE_BARRIER_VALUE;
aqlPacket.signal = buff->getSignal();
aqlPacket.completion_signal = Barriers().ActiveSignal();
header.header = kBarrierVendorPacketHeader;
header.AmdFormat = HSA_AMD_PACKET_TYPE_BARRIER_VALUE;
aqlPacket.signal = buff->getSignal();
aqlPacket.completion_signal = Barriers().ActiveSignal();
// mask is always applied on value at signal before performing
// the comparision defiend by 'condition'
switch (flags) {
case ROCCLR_STREAM_WAIT_VALUE_GTE:
aqlPacket.value = value;
aqlPacket.mask = mask;
aqlPacket.cond = HSA_SIGNAL_CONDITION_GTE;
break;
case ROCCLR_STREAM_WAIT_VALUE_EQ:
aqlPacket.value = value;
aqlPacket.mask = mask;
aqlPacket.cond = HSA_SIGNAL_CONDITION_EQ;
break;
case ROCCLR_STREAM_WAIT_VALUE_AND:
aqlPacket.value = 0;
aqlPacket.mask = (value & mask);
aqlPacket.cond = HSA_SIGNAL_CONDITION_NE;
break;
case ROCCLR_STREAM_WAIT_VALUE_NOR:
aqlPacket.value = ~value & mask;
aqlPacket.mask = ~value & mask;
aqlPacket.cond = HSA_SIGNAL_CONDITION_NE;
break;
default:
ShouldNotReachHere();
break;
// mask is always applied on value at signal before performing
// the comparision defiend by 'condition'
switch (flags) {
case ROCCLR_STREAM_WAIT_VALUE_GTE:
aqlPacket.value = value;
aqlPacket.mask = mask;
aqlPacket.cond = HSA_SIGNAL_CONDITION_GTE;
break;
case ROCCLR_STREAM_WAIT_VALUE_EQ:
aqlPacket.value = value;
aqlPacket.mask = mask;
aqlPacket.cond = HSA_SIGNAL_CONDITION_EQ;
break;
case ROCCLR_STREAM_WAIT_VALUE_AND:
aqlPacket.value = 0;
aqlPacket.mask = (value & mask);
aqlPacket.cond = HSA_SIGNAL_CONDITION_NE;
break;
case ROCCLR_STREAM_WAIT_VALUE_NOR:
aqlPacket.value = ~value & mask;
aqlPacket.mask = ~value & mask;
aqlPacket.cond = HSA_SIGNAL_CONDITION_NE;
break;
default:
ShouldNotReachHere();
break;
}
dispatchBarrierValuePacket(&aqlPacket, header);
}
// Use a blit kernel to perform the wait operation
else {
// mask is applied on value before performing
// the comparision defined by 'condition'
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWait(*memory, value,
sizeBytes, flags, mask);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx."
" Flags: 0x%lx mask: 0x%lx", value, flags, mask);
if (!result) {
LogError("submitStreamOperation: Wait failed!");
}
}
dispatchBarrierValuePacket(&aqlPacket, header);
} else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
amd::Coord3D origin(offset);
amd::Coord3D size(sizeBytes);
@@ -2312,10 +2326,12 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
// Ensure memory ordering preceding the write
dispatchBarrierPacket(kBarrierPacketReleaseHeader);
if (!fillMemory(CL_COMMAND_FILL_BUFFER, amdMemory, &value, sizeBytes, origin, size, true)) {
cmd.setStatus(CL_INVALID_OPERATION);
}
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWrite(*memory, value,
sizeBytes);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value);
if (!result) {
LogError("submitStreamOperation: Write failed!");
}
} else {
ShouldNotReachHere();
}
+4 -3
查看文件
@@ -666,9 +666,10 @@ class StreamOperationCommand : public OneMemoryArgCommand {
offset_(offset),
sizeBytes_(sizeBytes) {
// Sanity check
assert((cmdType == ROCCLR_COMMAND_STREAM_WRITE_VALUE ||
(cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE &&
memory_->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY)) &&
assert((cmdType == ROCCLR_COMMAND_STREAM_WRITE_VALUE) ||
(cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE) ||
(cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE && GPU_STREAMOPS_CP_WAIT &&
(memory_->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY)) &&
"Invalid Stream Operation");
}
+3 -1
查看文件
@@ -263,7 +263,9 @@ release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \
release(bool, AMD_CPU_AFFINITY, false, \
"Reset CPU affinity of any runtime threads") \
release(bool, ROC_USE_FGS_KERNARG, true, \
"Use fine grain kernel args segment for supported asics")
"Use fine grain kernel args segment for supported asics") \
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
"Force the stream wait memory operation to wait on CP.")
namespace amd {