SWDEV-347670 - GPU StreamWait and StreamWrite support in Windows PAL backend

Change-Id: Ic4881305b6332e217f3d3127dce7e9d9d0a7df11


[ROCm/clr commit: 373a7d1195]
This commit is contained in:
Ajay
2022-07-21 18:05:05 +00:00
کامیت شده توسط Ajay GunaShekar
والد 46df61f614
کامیت c2a7fe7bd6
9فایلهای تغییر یافته به همراه249 افزوده شده و 9 حذف شده
@@ -193,6 +193,24 @@ class BlitManager : public amd::HeapObject {
bool entire = false //!< Entire buffer will be updated
) const = 0;
//! Stream memory write operation - Write a 'value' at 'memory'.
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
uint64_t value,
size_t offset,
size_t sizeBytes
) const = 0;
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
uint64_t value,
size_t offset,
size_t sizeBytes,
uint64_t flags,
uint64_t mask
) const = 0;
//! Enables synchronization on blit operations
void enableSynchronization() { syncOperation_ = true; }
@@ -983,7 +983,9 @@ void CalcRowSlicePitches(uint64_t* pitch, const int32_t* copySize, size_t rowPit
}
}
static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) {
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index, size_t size,
const void* value, size_t offset,
const device::Memory* dev_mem, bool writeVAImmediate) const {
const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
void* param = kernel->parameters().values() + desc.offset_;
@@ -1003,7 +1005,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
} else {
// convert cl_mem to amd::Memory*, return false if invalid.
LP64_SWITCH(uint32_value, uint64_value) =
static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress());
static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress()) + offset;
reinterpret_cast<Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
*static_cast<Memory* const*>(value);
@@ -1036,7 +1038,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
break;
}
switch (argSize) {
switch (desc.size_) {
case sizeof(uint32_t):
*static_cast<uint32_t*>(param) = uint32_value;
break;
@@ -2426,6 +2428,85 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
return result;
}
// ================================================================================================
bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
size_t offset, size_t sizeBytes) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
uint blitType = StreamOpsWrite;
size_t dim = 1;
size_t globalWorkOffset[1] = { 0 };
size_t globalWorkSize[1] = { 1 };
size_t localWorkSize[1] = { 1 };
// Program kernels arguments for the write operation
Memory* mem = &gpuMem(memory);
bool is32BitWrite = (sizeBytes == sizeof(uint32_t)) ? true : false;
// Program kernels arguments for the write operation
if (is32BitWrite) {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
} else {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
}
setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit
address parameters = kernels_[blitType]->parameters().values();
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
synchronize();
return result;
}
// ================================================================================================
bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, size_t offset,
size_t sizeBytes, uint64_t flags, uint64_t mask) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
uint blitType = StreamOpsWait;
size_t dim = 1;
size_t globalWorkOffset[1] = { 0 };
size_t globalWorkSize[1] = { 1 };
size_t localWorkSize[1] = { 1 };
// Program kernels arguments for the wait operation
Memory* mem = &gpuMem(memory);
bool is32BitWait = (sizeBytes == sizeof(uint32_t)) ? true : false;
// Program kernels arguments for the wait operation
if (is32BitWait) {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags);
setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask);
} else {
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);
}
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
// Execute the blit
address parameters = kernels_[blitType]->parameters().values();
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
synchronize();
return result;
}
bool KernelBlitManager::runScheduler(device::Memory& vqueue, device::Memory& params, uint paramIdx,
uint threads) const {
amd::ScopedLock k(lockXferOps_);
@@ -155,6 +155,30 @@ class DmaBlitManager : public device::HostBlitManager {
bool entire = false //!< Entire buffer will be updated
) const;
//! Stream memory write operation - Write a 'value' at 'memory'.
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
uint64_t value,
size_t offset,
size_t sizeBytes
) const {
assert(!"Unimplemented");
return false;
};
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
uint64_t value,
size_t offset,
size_t sizeBytes,
uint64_t flags,
uint64_t mask
) const {
assert(!"Unimplemented");
return false;
};
protected:
static constexpr uint MaxPinnedBuffers = 4;
@@ -223,7 +247,9 @@ class KernelBlitManager : public DmaBlitManager {
FillImage,
Scheduler,
GwsInit,
BlitTotal
StreamOpsWrite,
StreamOpsWait,
BlitTotal,
};
//! Constructor
@@ -378,6 +404,23 @@ class KernelBlitManager : public DmaBlitManager {
virtual amd::Monitor* lockXfer() const { return &lockXferOps_; }
//! Stream memory write operation - Write a 'value' at 'memory'.
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
uint64_t value,
size_t offset,
size_t sizeBytes
) const;
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
uint64_t value,
size_t offset,
size_t sizeBytes,
uint64_t flags,
uint64_t mask
) const;
private:
static constexpr size_t MaxXferBuffers = 2;
static constexpr uint TransferSplitSize = 3;
@@ -408,6 +451,12 @@ class KernelBlitManager : public DmaBlitManager {
bool createProgram(Device& device //!< Device object
);
inline void setArgument(amd::Kernel* kernel, size_t index,
size_t size, const void* value, size_t offset = 0,
const device::Memory* dev_mem = nullptr,
bool writeVAImmediate = false) const;
//! Creates a view memory object
Memory* createView(const Memory& parent, //!< Parent memory object
const cl_image_format format //!< The new format for a view
@@ -432,7 +481,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer",
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned",
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler",
"__amd_rocclr_gwsInit"
"__amd_rocclr_gwsInit", "__amd_rocclr_streamOpsWrite",
"__amd_rocclr_streamOpsWait"
};
/*@}*/ // namespace pal
@@ -22,6 +22,25 @@ namespace pal {
#define BLIT_KERNEL(...) #__VA_ARGS__
const char* palBlitLinearSourceCode = BLIT_KERNEL(
\n
extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong);
\n
extern void __amd_streamOpsWait(__global uint*,__global ulong*, ulong, ulong, ulong);
\n
__kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong,
ulong value, ulong sizeBytes) {
__amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
}
\n
__kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong,
ulong value, ulong flags, ulong mask) {
__amd_streamOpsWait(ptrInt, ptrUlong, value, flags, mask);
}
\n);
const char* SchedulerSourceCode = BLIT_KERNEL(
\n
extern void __amd_scheduler(__global void*, __global void*, uint);
@@ -821,6 +821,8 @@ Device::~Device() {
extern const char* SchedulerSourceCode;
extern const char* SchedulerSourceCode20;
extern const char* GwsInitSourceCode;
extern const char* palBlitLinearSourceCode;
Pal::IDevice* gDeviceList[Pal::MaxDevices] = {};
uint32_t gStartDevice = 0;
uint32_t gNumDevices = 0;
@@ -2469,8 +2471,9 @@ bool Device::createBlitProgram() {
std::string extraBlits;
std::string ocl20;
if (amd::IS_HIP) {
extraBlits = palBlitLinearSourceCode;
if (info().cooperativeGroups_) {
extraBlits = GwsInitSourceCode;
extraBlits.append(GwsInitSourceCode);
}
}
else {
@@ -2134,6 +2134,48 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
profilingEnd(vcmd);
}
void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(cmd);
const cl_command_type type = cmd.type();
const uint64_t value = cmd.value();
const uint64_t mask = cmd.mask();
const unsigned int flags = cmd.flags();
const size_t sizeBytes = cmd.sizeBytes();
const size_t offset = cmd.offset();
amd::Memory* amdMemory = &cmd.memory();
Memory* memory = dev().getGpuMemory(amdMemory);
if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
// Use a blit kernel to perform the wait operation
// mask is applied on value before performing
// the comparision defined by 'condition'
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWait(*memory, value, offset,
sizeBytes, flags, mask);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx."
" Flags: 0x%lx mask: 0x%lx", value, flags, mask);
if (!result) {
LogError("submitStreamOperation: Wait failed!");
}
} else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWrite(*memory, value,
offset, sizeBytes);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value);
if (!result) {
LogError("submitStreamOperation: Write failed!");
}
} else {
ShouldNotReachHere();
}
profilingEnd(cmd);
}
void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
@@ -349,7 +349,7 @@ class VirtualGPU : public device::VirtualDevice {
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd);
virtual void submitStreamOperation(amd::StreamOperationCommand& cmd);
void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd);
void releaseMemory(GpuMemoryReference* mem);
@@ -157,6 +157,30 @@ class DmaBlitManager : public device::HostBlitManager {
bool entire = false //!< Entire buffer will be updated
) const;
//! Stream memory write operation - Write a 'value' at 'memory'.
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
uint64_t value,
size_t offset,
size_t sizeBytes
) const {
assert(!"Unimplemented");
return false;
}
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
uint64_t value,
size_t offset,
size_t sizeBytes,
uint64_t flags,
uint64_t mask
) const {
assert(!"Unimplemented");
return false;
}
protected:
static constexpr uint MaxPinnedBuffers = 4;
static constexpr size_t kMaxH2dMemcpySize = 8 * Ki;
@@ -427,14 +451,14 @@ class KernelBlitManager : public DmaBlitManager {
) const;
//! Stream memory write operation - Write a 'value' at 'memory'.
bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
uint64_t value,
size_t offset,
size_t sizeBytes
) const;
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
uint64_t value,
size_t offset,
size_t sizeBytes,
@@ -19,6 +19,7 @@
THE SOFTWARE. */
#include "platform/activity.hpp"
#include "platform/command_utils.hpp"
ACTIVITY_PROF_INSTANCES();
@@ -59,6 +60,8 @@ const char* getOclCommandKindString(uint32_t op) {
CASE_STRING(CL_COMMAND_SVM_MEMFILL, SvmMemFill)
CASE_STRING(CL_COMMAND_SVM_MAP, SvmMap)
CASE_STRING(CL_COMMAND_SVM_UNMAP, SvmUnmap)
CASE_STRING(ROCCLR_COMMAND_STREAM_WAIT_VALUE, StreamWait)
CASE_STRING(ROCCLR_COMMAND_STREAM_WRITE_VALUE, StreamWrite)
default: case_string = "Unknown command type";
};
return case_string;