diff --git a/rocclr/device/blitcl.cpp b/rocclr/device/blitcl.cpp index 54dbb9c17c..c5d5830bcd 100644 --- a/rocclr/device/blitcl.cpp +++ b/rocclr/device/blitcl.cpp @@ -34,6 +34,10 @@ const char* BlitSourceCode = BLIT_KERNELS( extern void __amd_fillBufferAligned(__global uchar*, __global ushort*, __global uint*, __global ulong*, __constant uchar*, uint, ulong, ulong); + extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong); + + extern void __amd_streamOpsWait(__global uint*,__global ulong*, ulong, ulong, ulong); + __kernel void __amd_rocclr_copyBufferRect(__global uchar* src, __global uchar* dst, ulong4 srcRect, ulong4 dstRect, ulong4 size) { __amd_copyBufferRect(src, dst, srcRect, dstRect, size); @@ -109,6 +113,14 @@ const char* BlitSourceCode = BLIT_KERNELS( int4 patternINT4, uint4 patternUINT4, int4 origin, int4 size, uint type) { __amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4, origin, size, type); - }); + } + __kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong, ulong value, ulong flags, + ulong mask) { + __amd_streamOpsWait(ptrInt, ptrUlong, value, flags, mask); + } + + __kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong, ulong value, ulong sizeBytes) { + __amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes); + }); } // namespace device diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index 63e4c379f0..868258689a 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -2278,6 +2278,83 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, return result; } +// ================================================================================================ +bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value, + size_t sizeBytes) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + uint blitType = StreamOpsWrite; + size_t dim = 1; + size_t globalWorkOffset[1] = { 0 }; + size_t globalWorkSize[1] = { 1 }; + size_t localWorkSize[1] = { 1 }; + // Program kernels arguments for the write operation + cl_mem mem = as_cl(memory.owner()); + bool is32BitWrite = (sizeBytes == sizeof(uint32_t)) ? true : false; + // Program kernels arguments for the write operation + if (is32BitWrite) { + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value); + } else { + setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); + } + setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes); + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + // Execute the blit + address parameters = captureArguments(kernels_[blitType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); + releaseArguments(parameters); + synchronize(); + return result; +} + +// ================================================================================================ +bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, size_t sizeBytes, + uint64_t flags, uint64_t mask) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + uint blitType = StreamOpsWait; + size_t dim = 1; + + size_t globalWorkOffset[1] = { 0 }; + size_t globalWorkSize[1] = { 1 }; + size_t localWorkSize[1] = { 1 }; + + // Program kernels arguments for the wait operation + cl_mem mem = as_cl(memory.owner()); + bool is32BitWait = (sizeBytes == sizeof(uint32_t)) ? true : false; + // Program kernels arguments for the wait operation + if (is32BitWait) { + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr); + setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value); + setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags); + setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask); + } else { + setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value); + setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags); + setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask); + } + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = captureArguments(kernels_[blitType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); + releaseArguments(parameters); + synchronize(); + + return result; +} +// ================================================================================================ + amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize, size_t& partial) const { size_t pinAllocSize; diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp index c1a758f6bf..3de0d75bc2 100644 --- a/rocclr/device/rocm/rocblit.hpp +++ b/rocclr/device/rocm/rocblit.hpp @@ -242,6 +242,8 @@ class KernelBlitManager : public DmaBlitManager { FillImage, Scheduler, GwsInit, + StreamOpsWrite, + StreamOpsWait, BlitTotal }; @@ -387,6 +389,20 @@ class KernelBlitManager : public DmaBlitManager { bool RunGwsInit(uint32_t value //!< Initial value for GWS resource ) const; + //! Stream memory write operation - Write a 'value' at 'memory'. + bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value' + uint64_t value, + size_t sizeBytes + ) const; + + //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op. + bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against + uint64_t value, + size_t sizeBytes, + uint64_t flags, + uint64_t mask + ) const; + virtual amd::Monitor* lockXfer() const { return &lockXferOps_; } private: @@ -466,7 +482,7 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = { "__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer", "__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned", "__amd_rocclr_fillImage", "__amd_rocclr_scheduler", - "__amd_rocclr_gwsInit" + "__amd_rocclr_gwsInit", "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait" }; inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index, diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 5c88bf6894..4045f09d21 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -2257,7 +2257,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { profilingBegin(cmd); const cl_command_type type = cmd.type(); - const int64_t value = cmd.value(); + const uint64_t value = cmd.value(); const uint64_t mask = cmd.mask(); const unsigned int flags = cmd.flags(); const size_t sizeBytes = cmd.sizeBytes(); @@ -2267,44 +2267,58 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { Memory* memory = dev().getRocMemory(amdMemory); if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) { - hsa_amd_barrier_value_packet_t aqlPacket; - hsa_amd_vendor_packet_header_t header; - hsa_signal_t signal; - Buffer* buff = static_cast(memory); + if (GPU_STREAMOPS_CP_WAIT) { + hsa_amd_barrier_value_packet_t aqlPacket; + hsa_amd_vendor_packet_header_t header; + hsa_signal_t signal; + Buffer* buff = static_cast(memory); - header.header = kBarrierVendorPacketHeader; - header.AmdFormat = HSA_AMD_PACKET_TYPE_BARRIER_VALUE; - aqlPacket.signal = buff->getSignal(); - aqlPacket.completion_signal = Barriers().ActiveSignal(); + header.header = kBarrierVendorPacketHeader; + header.AmdFormat = HSA_AMD_PACKET_TYPE_BARRIER_VALUE; + aqlPacket.signal = buff->getSignal(); + aqlPacket.completion_signal = Barriers().ActiveSignal(); - // mask is always applied on value at signal before performing - // the comparision defiend by 'condition' - switch (flags) { - case ROCCLR_STREAM_WAIT_VALUE_GTE: - aqlPacket.value = value; - aqlPacket.mask = mask; - aqlPacket.cond = HSA_SIGNAL_CONDITION_GTE; - break; - case ROCCLR_STREAM_WAIT_VALUE_EQ: - aqlPacket.value = value; - aqlPacket.mask = mask; - aqlPacket.cond = HSA_SIGNAL_CONDITION_EQ; - break; - case ROCCLR_STREAM_WAIT_VALUE_AND: - aqlPacket.value = 0; - aqlPacket.mask = (value & mask); - aqlPacket.cond = HSA_SIGNAL_CONDITION_NE; - break; - case ROCCLR_STREAM_WAIT_VALUE_NOR: - aqlPacket.value = ~value & mask; - aqlPacket.mask = ~value & mask; - aqlPacket.cond = HSA_SIGNAL_CONDITION_NE; - break; - default: - ShouldNotReachHere(); - break; + // mask is always applied on value at signal before performing + // the comparision defiend by 'condition' + switch (flags) { + case ROCCLR_STREAM_WAIT_VALUE_GTE: + aqlPacket.value = value; + aqlPacket.mask = mask; + aqlPacket.cond = HSA_SIGNAL_CONDITION_GTE; + break; + case ROCCLR_STREAM_WAIT_VALUE_EQ: + aqlPacket.value = value; + aqlPacket.mask = mask; + aqlPacket.cond = HSA_SIGNAL_CONDITION_EQ; + break; + case ROCCLR_STREAM_WAIT_VALUE_AND: + aqlPacket.value = 0; + aqlPacket.mask = (value & mask); + aqlPacket.cond = HSA_SIGNAL_CONDITION_NE; + break; + case ROCCLR_STREAM_WAIT_VALUE_NOR: + aqlPacket.value = ~value & mask; + aqlPacket.mask = ~value & mask; + aqlPacket.cond = HSA_SIGNAL_CONDITION_NE; + break; + default: + ShouldNotReachHere(); + break; + } + dispatchBarrierValuePacket(&aqlPacket, header); + } + // Use a blit kernel to perform the wait operation + else { + // mask is applied on value before performing + // the comparision defined by 'condition' + bool result = static_cast(blitMgr()).streamOpsWait(*memory, value, + sizeBytes, flags, mask); + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx." + " Flags: 0x%lx mask: 0x%lx", value, flags, mask); + if (!result) { + LogError("submitStreamOperation: Wait failed!"); + } } - dispatchBarrierValuePacket(&aqlPacket, header); } else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) { amd::Coord3D origin(offset); amd::Coord3D size(sizeBytes); @@ -2312,10 +2326,12 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { // Ensure memory ordering preceding the write dispatchBarrierPacket(kBarrierPacketReleaseHeader); - if (!fillMemory(CL_COMMAND_FILL_BUFFER, amdMemory, &value, sizeBytes, origin, size, true)) { - cmd.setStatus(CL_INVALID_OPERATION); - } + bool result = static_cast(blitMgr()).streamOpsWrite(*memory, value, + sizeBytes); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value); + if (!result) { + LogError("submitStreamOperation: Write failed!"); + } } else { ShouldNotReachHere(); } diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index e30d2d22cc..e4ac030009 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -666,9 +666,10 @@ class StreamOperationCommand : public OneMemoryArgCommand { offset_(offset), sizeBytes_(sizeBytes) { // Sanity check - assert((cmdType == ROCCLR_COMMAND_STREAM_WRITE_VALUE || - (cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE && - memory_->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY)) && + assert((cmdType == ROCCLR_COMMAND_STREAM_WRITE_VALUE) || + (cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE) || + (cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE && GPU_STREAMOPS_CP_WAIT && + (memory_->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY)) && "Invalid Stream Operation"); } diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index db18ab3fd2..81acf7c9f7 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -263,7 +263,9 @@ release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \ release(bool, AMD_CPU_AFFINITY, false, \ "Reset CPU affinity of any runtime threads") \ release(bool, ROC_USE_FGS_KERNARG, true, \ - "Use fine grain kernel args segment for supported asics") + "Use fine grain kernel args segment for supported asics") \ +release(bool, GPU_STREAMOPS_CP_WAIT, false, \ + "Force the stream wait memory operation to wait on CP.") namespace amd {