From c2a7fe7bd6947f80bff6c8d11fa04221ab281b14 Mon Sep 17 00:00:00 2001 From: Ajay Date: Thu, 21 Jul 2022 18:05:05 +0000 Subject: [PATCH] SWDEV-347670 - GPU StreamWait and StreamWrite support in Windows PAL backend Change-Id: Ic4881305b6332e217f3d3127dce7e9d9d0a7df11 [ROCm/clr commit: 373a7d1195f6bdf9784ef00eed78a4bddc1fe20d] --- projects/clr/rocclr/device/blit.hpp | 18 ++++ projects/clr/rocclr/device/pal/palblit.cpp | 87 ++++++++++++++++++- projects/clr/rocclr/device/pal/palblit.hpp | 54 +++++++++++- projects/clr/rocclr/device/pal/palblitcl.cpp | 19 ++++ projects/clr/rocclr/device/pal/paldevice.cpp | 5 +- projects/clr/rocclr/device/pal/palvirtual.cpp | 42 +++++++++ projects/clr/rocclr/device/pal/palvirtual.hpp | 2 +- projects/clr/rocclr/device/rocm/rocblit.hpp | 28 +++++- projects/clr/rocclr/platform/activity.cpp | 3 + 9 files changed, 249 insertions(+), 9 deletions(-) diff --git a/projects/clr/rocclr/device/blit.hpp b/projects/clr/rocclr/device/blit.hpp index 794bfa15ca..9c7d96f016 100644 --- a/projects/clr/rocclr/device/blit.hpp +++ b/projects/clr/rocclr/device/blit.hpp @@ -193,6 +193,24 @@ class BlitManager : public amd::HeapObject { bool entire = false //!< Entire buffer will be updated ) const = 0; + + //! Stream memory write operation - Write a 'value' at 'memory'. + virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value' + uint64_t value, + size_t offset, + size_t sizeBytes + ) const = 0; + + + //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op. + virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against + uint64_t value, + size_t offset, + size_t sizeBytes, + uint64_t flags, + uint64_t mask + ) const = 0; + //! Enables synchronization on blit operations void enableSynchronization() { syncOperation_ = true; } diff --git a/projects/clr/rocclr/device/pal/palblit.cpp b/projects/clr/rocclr/device/pal/palblit.cpp index 0f2e842703..3d802ed4f7 100644 --- a/projects/clr/rocclr/device/pal/palblit.cpp +++ b/projects/clr/rocclr/device/pal/palblit.cpp @@ -983,7 +983,9 @@ void CalcRowSlicePitches(uint64_t* pitch, const int32_t* copySize, size_t rowPit } } -static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) { +inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index, size_t size, + const void* value, size_t offset, + const device::Memory* dev_mem, bool writeVAImmediate) const { const amd::KernelParameterDescriptor& desc = kernel->signature().at(index); void* param = kernel->parameters().values() + desc.offset_; @@ -1003,7 +1005,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo } else { // convert cl_mem to amd::Memory*, return false if invalid. LP64_SWITCH(uint32_value, uint64_value) = - static_cast((*static_cast(value))->virtualAddress()); + static_cast((*static_cast(value))->virtualAddress()) + offset; reinterpret_cast(kernel->parameters().values() + kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = *static_cast(value); @@ -1036,7 +1038,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo break; } - switch (argSize) { + switch (desc.size_) { case sizeof(uint32_t): *static_cast(param) = uint32_value; break; @@ -2426,6 +2428,85 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, return result; } +// ================================================================================================ +bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value, + size_t offset, size_t sizeBytes) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + uint blitType = StreamOpsWrite; + size_t dim = 1; + size_t globalWorkOffset[1] = { 0 }; + size_t globalWorkSize[1] = { 1 }; + size_t localWorkSize[1] = { 1 }; + // Program kernels arguments for the write operation + Memory* mem = &gpuMem(memory); + bool is32BitWrite = (sizeBytes == sizeof(uint32_t)) ? true : false; + // Program kernels arguments for the write operation + if (is32BitWrite) { + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value); + } else { + setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset); + setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); + } + setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes); + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + synchronize(); + return result; +} + +// ================================================================================================ +bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, size_t offset, + size_t sizeBytes, uint64_t flags, uint64_t mask) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + uint blitType = StreamOpsWait; + size_t dim = 1; + + size_t globalWorkOffset[1] = { 0 }; + size_t globalWorkSize[1] = { 1 }; + size_t localWorkSize[1] = { 1 }; + + // Program kernels arguments for the wait operation + Memory* mem = &gpuMem(memory); + bool is32BitWait = (sizeBytes == sizeof(uint32_t)) ? true : false; + // Program kernels arguments for the wait operation + if (is32BitWait) { + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value); + setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags); + setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask); + } else { + setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset); + setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); + setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags); + setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask); + } + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + synchronize(); + + return result; +} + + + + + + bool KernelBlitManager::runScheduler(device::Memory& vqueue, device::Memory& params, uint paramIdx, uint threads) const { amd::ScopedLock k(lockXferOps_); diff --git a/projects/clr/rocclr/device/pal/palblit.hpp b/projects/clr/rocclr/device/pal/palblit.hpp index b31600eb06..260c9ca257 100644 --- a/projects/clr/rocclr/device/pal/palblit.hpp +++ b/projects/clr/rocclr/device/pal/palblit.hpp @@ -155,6 +155,30 @@ class DmaBlitManager : public device::HostBlitManager { bool entire = false //!< Entire buffer will be updated ) const; + //! Stream memory write operation - Write a 'value' at 'memory'. + virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value' + uint64_t value, + size_t offset, + size_t sizeBytes + ) const { + assert(!"Unimplemented"); + return false; + }; + + //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op. + virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against + uint64_t value, + size_t offset, + size_t sizeBytes, + uint64_t flags, + uint64_t mask + ) const { + assert(!"Unimplemented"); + return false; + }; + + + protected: static constexpr uint MaxPinnedBuffers = 4; @@ -223,7 +247,9 @@ class KernelBlitManager : public DmaBlitManager { FillImage, Scheduler, GwsInit, - BlitTotal + StreamOpsWrite, + StreamOpsWait, + BlitTotal, }; //! Constructor @@ -378,6 +404,23 @@ class KernelBlitManager : public DmaBlitManager { virtual amd::Monitor* lockXfer() const { return &lockXferOps_; } + //! Stream memory write operation - Write a 'value' at 'memory'. + virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value' + uint64_t value, + size_t offset, + size_t sizeBytes + ) const; + + //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op. + virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against + uint64_t value, + size_t offset, + size_t sizeBytes, + uint64_t flags, + uint64_t mask + ) const; + + private: static constexpr size_t MaxXferBuffers = 2; static constexpr uint TransferSplitSize = 3; @@ -408,6 +451,12 @@ class KernelBlitManager : public DmaBlitManager { bool createProgram(Device& device //!< Device object ); + inline void setArgument(amd::Kernel* kernel, size_t index, + size_t size, const void* value, size_t offset = 0, + const device::Memory* dev_mem = nullptr, + bool writeVAImmediate = false) const; + + //! Creates a view memory object Memory* createView(const Memory& parent, //!< Parent memory object const cl_image_format format //!< The new format for a view @@ -432,7 +481,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = { "__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer", "__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned", "__amd_rocclr_fillImage", "__amd_rocclr_scheduler", - "__amd_rocclr_gwsInit" + "__amd_rocclr_gwsInit", "__amd_rocclr_streamOpsWrite", + "__amd_rocclr_streamOpsWait" }; /*@}*/ // namespace pal diff --git a/projects/clr/rocclr/device/pal/palblitcl.cpp b/projects/clr/rocclr/device/pal/palblitcl.cpp index 954fac9034..862919c85a 100644 --- a/projects/clr/rocclr/device/pal/palblitcl.cpp +++ b/projects/clr/rocclr/device/pal/palblitcl.cpp @@ -22,6 +22,25 @@ namespace pal { #define BLIT_KERNEL(...) #__VA_ARGS__ +const char* palBlitLinearSourceCode = BLIT_KERNEL( +\n +extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong); +\n +extern void __amd_streamOpsWait(__global uint*,__global ulong*, ulong, ulong, ulong); +\n +__kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong, + ulong value, ulong sizeBytes) { + __amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes); +} +\n +__kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong, + ulong value, ulong flags, ulong mask) { + __amd_streamOpsWait(ptrInt, ptrUlong, value, flags, mask); +} +\n); + + + const char* SchedulerSourceCode = BLIT_KERNEL( \n extern void __amd_scheduler(__global void*, __global void*, uint); diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index 36b79ae97f..81ba65e74d 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -821,6 +821,8 @@ Device::~Device() { extern const char* SchedulerSourceCode; extern const char* SchedulerSourceCode20; extern const char* GwsInitSourceCode; +extern const char* palBlitLinearSourceCode; + Pal::IDevice* gDeviceList[Pal::MaxDevices] = {}; uint32_t gStartDevice = 0; uint32_t gNumDevices = 0; @@ -2469,8 +2471,9 @@ bool Device::createBlitProgram() { std::string extraBlits; std::string ocl20; if (amd::IS_HIP) { + extraBlits = palBlitLinearSourceCode; if (info().cooperativeGroups_) { - extraBlits = GwsInitSourceCode; + extraBlits.append(GwsInitSourceCode); } } else { diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index dfbd198b4b..d2827e40b8 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -2134,6 +2134,48 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) { profilingEnd(vcmd); } +void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + profilingBegin(cmd); + + const cl_command_type type = cmd.type(); + const uint64_t value = cmd.value(); + const uint64_t mask = cmd.mask(); + const unsigned int flags = cmd.flags(); + const size_t sizeBytes = cmd.sizeBytes(); + const size_t offset = cmd.offset(); + + amd::Memory* amdMemory = &cmd.memory(); + Memory* memory = dev().getGpuMemory(amdMemory); + + if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) { + + // Use a blit kernel to perform the wait operation + // mask is applied on value before performing + // the comparision defined by 'condition' + bool result = static_cast(blitMgr()).streamOpsWait(*memory, value, offset, + sizeBytes, flags, mask); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx." + " Flags: 0x%lx mask: 0x%lx", value, flags, mask); + if (!result) { + LogError("submitStreamOperation: Wait failed!"); + } + } else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) { + bool result = static_cast(blitMgr()).streamOpsWrite(*memory, value, + offset, sizeBytes); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value); + if (!result) { + LogError("submitStreamOperation: Write failed!"); + } + } else { + ShouldNotReachHere(); + } + profilingEnd(cmd); +} + + + void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index 7f5cac3aa9..9483456b95 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -349,7 +349,7 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); virtual void submitVirtualMap(amd::VirtualMapCommand& cmd); - + virtual void submitStreamOperation(amd::StreamOperationCommand& cmd); void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd); void releaseMemory(GpuMemoryReference* mem); diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp index 6d9444431d..bdeef4a040 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.hpp +++ b/projects/clr/rocclr/device/rocm/rocblit.hpp @@ -157,6 +157,30 @@ class DmaBlitManager : public device::HostBlitManager { bool entire = false //!< Entire buffer will be updated ) const; + //! Stream memory write operation - Write a 'value' at 'memory'. + virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value' + uint64_t value, + size_t offset, + size_t sizeBytes + ) const { + assert(!"Unimplemented"); + return false; + } + + //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op. + virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against + uint64_t value, + size_t offset, + size_t sizeBytes, + uint64_t flags, + uint64_t mask + ) const { + assert(!"Unimplemented"); + return false; + } + + + protected: static constexpr uint MaxPinnedBuffers = 4; static constexpr size_t kMaxH2dMemcpySize = 8 * Ki; @@ -427,14 +451,14 @@ class KernelBlitManager : public DmaBlitManager { ) const; //! Stream memory write operation - Write a 'value' at 'memory'. - bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value' + virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value' uint64_t value, size_t offset, size_t sizeBytes ) const; //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op. - bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against + virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against uint64_t value, size_t offset, size_t sizeBytes, diff --git a/projects/clr/rocclr/platform/activity.cpp b/projects/clr/rocclr/platform/activity.cpp index 2e19ef2738..e6c862274a 100644 --- a/projects/clr/rocclr/platform/activity.cpp +++ b/projects/clr/rocclr/platform/activity.cpp @@ -19,6 +19,7 @@ THE SOFTWARE. */ #include "platform/activity.hpp" +#include "platform/command_utils.hpp" ACTIVITY_PROF_INSTANCES(); @@ -59,6 +60,8 @@ const char* getOclCommandKindString(uint32_t op) { CASE_STRING(CL_COMMAND_SVM_MEMFILL, SvmMemFill) CASE_STRING(CL_COMMAND_SVM_MAP, SvmMap) CASE_STRING(CL_COMMAND_SVM_UNMAP, SvmUnmap) + CASE_STRING(ROCCLR_COMMAND_STREAM_WAIT_VALUE, StreamWait) + CASE_STRING(ROCCLR_COMMAND_STREAM_WRITE_VALUE, StreamWrite) default: case_string = "Unknown command type"; }; return case_string;