SWDEV-292525 - [vdi] Path to streamOps shaders
Implementation to use a blit kernel to perform a hipStreamWait/write instead of an AQL packet. Change-Id: I462671ed5cec37144dfe97ff66439249196117c1
此提交包含在:
+13
-1
@@ -34,6 +34,10 @@ const char* BlitSourceCode = BLIT_KERNELS(
|
||||
extern void __amd_fillBufferAligned(__global uchar*, __global ushort*, __global uint*, __global ulong*,
|
||||
__constant uchar*, uint, ulong, ulong);
|
||||
|
||||
extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong);
|
||||
|
||||
extern void __amd_streamOpsWait(__global uint*,__global ulong*, ulong, ulong, ulong);
|
||||
|
||||
__kernel void __amd_rocclr_copyBufferRect(__global uchar* src, __global uchar* dst, ulong4 srcRect,
|
||||
ulong4 dstRect, ulong4 size) {
|
||||
__amd_copyBufferRect(src, dst, srcRect, dstRect, size);
|
||||
@@ -109,6 +113,14 @@ const char* BlitSourceCode = BLIT_KERNELS(
|
||||
int4 patternINT4, uint4 patternUINT4, int4 origin, int4 size,
|
||||
uint type) {
|
||||
__amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4, origin, size, type);
|
||||
});
|
||||
}
|
||||
|
||||
__kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong, ulong value, ulong flags,
|
||||
ulong mask) {
|
||||
__amd_streamOpsWait(ptrInt, ptrUlong, value, flags, mask);
|
||||
}
|
||||
|
||||
__kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong, ulong value, ulong sizeBytes) {
|
||||
__amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
|
||||
});
|
||||
} // namespace device
|
||||
|
||||
@@ -2278,6 +2278,83 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
|
||||
size_t sizeBytes) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
uint blitType = StreamOpsWrite;
|
||||
size_t dim = 1;
|
||||
size_t globalWorkOffset[1] = { 0 };
|
||||
size_t globalWorkSize[1] = { 1 };
|
||||
size_t localWorkSize[1] = { 1 };
|
||||
// Program kernels arguments for the write operation
|
||||
cl_mem mem = as_cl<amd::Memory>(memory.owner());
|
||||
bool is32BitWrite = (sizeBytes == sizeof(uint32_t)) ? true : false;
|
||||
// Program kernels arguments for the write operation
|
||||
if (is32BitWrite) {
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
|
||||
} else {
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
|
||||
}
|
||||
setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
// Execute the blit
|
||||
address parameters = captureArguments(kernels_[blitType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
synchronize();
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, size_t sizeBytes,
|
||||
uint64_t flags, uint64_t mask) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
uint blitType = StreamOpsWait;
|
||||
size_t dim = 1;
|
||||
|
||||
size_t globalWorkOffset[1] = { 0 };
|
||||
size_t globalWorkSize[1] = { 1 };
|
||||
size_t localWorkSize[1] = { 1 };
|
||||
|
||||
// Program kernels arguments for the wait operation
|
||||
cl_mem mem = as_cl<amd::Memory>(memory.owner());
|
||||
bool is32BitWait = (sizeBytes == sizeof(uint32_t)) ? true : false;
|
||||
// Program kernels arguments for the wait operation
|
||||
if (is32BitWait) {
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
|
||||
setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags);
|
||||
setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask);
|
||||
} else {
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
|
||||
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
|
||||
setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
|
||||
setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);
|
||||
}
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = captureArguments(kernels_[blitType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
synchronize();
|
||||
|
||||
return result;
|
||||
}
|
||||
// ================================================================================================
|
||||
|
||||
amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize,
|
||||
size_t& partial) const {
|
||||
size_t pinAllocSize;
|
||||
|
||||
@@ -242,6 +242,8 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
FillImage,
|
||||
Scheduler,
|
||||
GwsInit,
|
||||
StreamOpsWrite,
|
||||
StreamOpsWait,
|
||||
BlitTotal
|
||||
};
|
||||
|
||||
@@ -387,6 +389,20 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
bool RunGwsInit(uint32_t value //!< Initial value for GWS resource
|
||||
) const;
|
||||
|
||||
//! Stream memory write operation - Write a 'value' at 'memory'.
|
||||
bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
|
||||
uint64_t value,
|
||||
size_t sizeBytes
|
||||
) const;
|
||||
|
||||
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
|
||||
bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
|
||||
uint64_t value,
|
||||
size_t sizeBytes,
|
||||
uint64_t flags,
|
||||
uint64_t mask
|
||||
) const;
|
||||
|
||||
virtual amd::Monitor* lockXfer() const { return &lockXferOps_; }
|
||||
|
||||
private:
|
||||
@@ -466,7 +482,7 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
|
||||
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer",
|
||||
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned",
|
||||
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler",
|
||||
"__amd_rocclr_gwsInit"
|
||||
"__amd_rocclr_gwsInit", "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait"
|
||||
};
|
||||
|
||||
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
|
||||
|
||||
@@ -2257,7 +2257,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
|
||||
profilingBegin(cmd);
|
||||
|
||||
const cl_command_type type = cmd.type();
|
||||
const int64_t value = cmd.value();
|
||||
const uint64_t value = cmd.value();
|
||||
const uint64_t mask = cmd.mask();
|
||||
const unsigned int flags = cmd.flags();
|
||||
const size_t sizeBytes = cmd.sizeBytes();
|
||||
@@ -2267,44 +2267,58 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
|
||||
Memory* memory = dev().getRocMemory(amdMemory);
|
||||
|
||||
if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
|
||||
hsa_amd_barrier_value_packet_t aqlPacket;
|
||||
hsa_amd_vendor_packet_header_t header;
|
||||
hsa_signal_t signal;
|
||||
Buffer* buff = static_cast<Buffer*>(memory);
|
||||
if (GPU_STREAMOPS_CP_WAIT) {
|
||||
hsa_amd_barrier_value_packet_t aqlPacket;
|
||||
hsa_amd_vendor_packet_header_t header;
|
||||
hsa_signal_t signal;
|
||||
Buffer* buff = static_cast<Buffer*>(memory);
|
||||
|
||||
header.header = kBarrierVendorPacketHeader;
|
||||
header.AmdFormat = HSA_AMD_PACKET_TYPE_BARRIER_VALUE;
|
||||
aqlPacket.signal = buff->getSignal();
|
||||
aqlPacket.completion_signal = Barriers().ActiveSignal();
|
||||
header.header = kBarrierVendorPacketHeader;
|
||||
header.AmdFormat = HSA_AMD_PACKET_TYPE_BARRIER_VALUE;
|
||||
aqlPacket.signal = buff->getSignal();
|
||||
aqlPacket.completion_signal = Barriers().ActiveSignal();
|
||||
|
||||
// mask is always applied on value at signal before performing
|
||||
// the comparision defiend by 'condition'
|
||||
switch (flags) {
|
||||
case ROCCLR_STREAM_WAIT_VALUE_GTE:
|
||||
aqlPacket.value = value;
|
||||
aqlPacket.mask = mask;
|
||||
aqlPacket.cond = HSA_SIGNAL_CONDITION_GTE;
|
||||
break;
|
||||
case ROCCLR_STREAM_WAIT_VALUE_EQ:
|
||||
aqlPacket.value = value;
|
||||
aqlPacket.mask = mask;
|
||||
aqlPacket.cond = HSA_SIGNAL_CONDITION_EQ;
|
||||
break;
|
||||
case ROCCLR_STREAM_WAIT_VALUE_AND:
|
||||
aqlPacket.value = 0;
|
||||
aqlPacket.mask = (value & mask);
|
||||
aqlPacket.cond = HSA_SIGNAL_CONDITION_NE;
|
||||
break;
|
||||
case ROCCLR_STREAM_WAIT_VALUE_NOR:
|
||||
aqlPacket.value = ~value & mask;
|
||||
aqlPacket.mask = ~value & mask;
|
||||
aqlPacket.cond = HSA_SIGNAL_CONDITION_NE;
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
break;
|
||||
// mask is always applied on value at signal before performing
|
||||
// the comparision defiend by 'condition'
|
||||
switch (flags) {
|
||||
case ROCCLR_STREAM_WAIT_VALUE_GTE:
|
||||
aqlPacket.value = value;
|
||||
aqlPacket.mask = mask;
|
||||
aqlPacket.cond = HSA_SIGNAL_CONDITION_GTE;
|
||||
break;
|
||||
case ROCCLR_STREAM_WAIT_VALUE_EQ:
|
||||
aqlPacket.value = value;
|
||||
aqlPacket.mask = mask;
|
||||
aqlPacket.cond = HSA_SIGNAL_CONDITION_EQ;
|
||||
break;
|
||||
case ROCCLR_STREAM_WAIT_VALUE_AND:
|
||||
aqlPacket.value = 0;
|
||||
aqlPacket.mask = (value & mask);
|
||||
aqlPacket.cond = HSA_SIGNAL_CONDITION_NE;
|
||||
break;
|
||||
case ROCCLR_STREAM_WAIT_VALUE_NOR:
|
||||
aqlPacket.value = ~value & mask;
|
||||
aqlPacket.mask = ~value & mask;
|
||||
aqlPacket.cond = HSA_SIGNAL_CONDITION_NE;
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
break;
|
||||
}
|
||||
dispatchBarrierValuePacket(&aqlPacket, header);
|
||||
}
|
||||
// Use a blit kernel to perform the wait operation
|
||||
else {
|
||||
// mask is applied on value before performing
|
||||
// the comparision defined by 'condition'
|
||||
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWait(*memory, value,
|
||||
sizeBytes, flags, mask);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx."
|
||||
" Flags: 0x%lx mask: 0x%lx", value, flags, mask);
|
||||
if (!result) {
|
||||
LogError("submitStreamOperation: Wait failed!");
|
||||
}
|
||||
}
|
||||
dispatchBarrierValuePacket(&aqlPacket, header);
|
||||
} else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
|
||||
amd::Coord3D origin(offset);
|
||||
amd::Coord3D size(sizeBytes);
|
||||
@@ -2312,10 +2326,12 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
|
||||
// Ensure memory ordering preceding the write
|
||||
dispatchBarrierPacket(kBarrierPacketReleaseHeader);
|
||||
|
||||
if (!fillMemory(CL_COMMAND_FILL_BUFFER, amdMemory, &value, sizeBytes, origin, size, true)) {
|
||||
cmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWrite(*memory, value,
|
||||
sizeBytes);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value);
|
||||
if (!result) {
|
||||
LogError("submitStreamOperation: Write failed!");
|
||||
}
|
||||
} else {
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
|
||||
@@ -666,9 +666,10 @@ class StreamOperationCommand : public OneMemoryArgCommand {
|
||||
offset_(offset),
|
||||
sizeBytes_(sizeBytes) {
|
||||
// Sanity check
|
||||
assert((cmdType == ROCCLR_COMMAND_STREAM_WRITE_VALUE ||
|
||||
(cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE &&
|
||||
memory_->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY)) &&
|
||||
assert((cmdType == ROCCLR_COMMAND_STREAM_WRITE_VALUE) ||
|
||||
(cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE) ||
|
||||
(cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE && GPU_STREAMOPS_CP_WAIT &&
|
||||
(memory_->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY)) &&
|
||||
"Invalid Stream Operation");
|
||||
}
|
||||
|
||||
|
||||
+3
-1
@@ -263,7 +263,9 @@ release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \
|
||||
release(bool, AMD_CPU_AFFINITY, false, \
|
||||
"Reset CPU affinity of any runtime threads") \
|
||||
release(bool, ROC_USE_FGS_KERNARG, true, \
|
||||
"Use fine grain kernel args segment for supported asics")
|
||||
"Use fine grain kernel args segment for supported asics") \
|
||||
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
|
||||
"Force the stream wait memory operation to wait on CP.")
|
||||
|
||||
namespace amd {
|
||||
|
||||
|
||||
新增問題並參考
封鎖使用者