diff --git a/hipamd/include/hip/amd_detail/hip_api_trace.hpp b/hipamd/include/hip/amd_detail/hip_api_trace.hpp index 7da13a9805..acfbe1a2e0 100644 --- a/hipamd/include/hip/amd_detail/hip_api_trace.hpp +++ b/hipamd/include/hip/amd_detail/hip_api_trace.hpp @@ -61,7 +61,7 @@ // - Reset any of the *_STEP_VERSION defines to zero if the corresponding *_MAJOR_VERSION increases #define HIP_API_TABLE_STEP_VERSION 0 #define HIP_COMPILER_API_TABLE_STEP_VERSION 0 -#define HIP_RUNTIME_API_TABLE_STEP_VERSION 6 +#define HIP_RUNTIME_API_TABLE_STEP_VERSION 7 // HIP API interface typedef hipError_t (*t___hipPopCallConfiguration)(dim3* gridDim, dim3* blockDim, size_t* sharedMem, @@ -722,6 +722,8 @@ typedef hipError_t (*t_hipStreamWriteValue32)(hipStream_t stream, void* ptr, uin unsigned int flags); typedef hipError_t (*t_hipStreamWriteValue64)(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags); +typedef hipError_t (*t_hipStreamBatchMemOp)(hipStream_t stream, unsigned int count, + hipStreamBatchMemOpParams* paramArray, unsigned int flags); typedef hipError_t (*t_hipTexObjectCreate)(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc, const HIP_TEXTURE_DESC* pTexDesc, @@ -1519,6 +1521,9 @@ struct HipDispatchTable { // HIP_RUNTIME_API_TABLE_STEP_VERSION == 6 t_hipDeviceGetTexture1DLinearMaxWidth hipDeviceGetTexture1DLinearMaxWidth_fn; + // HIP_RUNTIME_API_TABLE_STEP_VERSION == 7 + t_hipStreamBatchMemOp hipStreamBatchMemOp_fn; + // DO NOT EDIT ABOVE! // HIP_RUNTIME_API_TABLE_STEP_VERSION == 7 diff --git a/hipamd/include/hip/amd_detail/hip_prof_str.h b/hipamd/include/hip/amd_detail/hip_prof_str.h index e11b1abcf3..9cb1e275e6 100644 --- a/hipamd/include/hip/amd_detail/hip_prof_str.h +++ b/hipamd/include/hip/amd_detail/hip_prof_str.h @@ -425,7 +425,8 @@ enum hip_api_id_t { HIP_API_ID_hipMemcpyHtoAAsync = 405, HIP_API_ID_hipSetValidDevices = 406, HIP_API_ID_hipExtHostAlloc = 407, - HIP_API_ID_LAST = 407, + HIP_API_ID_hipStreamBatchMemOp = 408, + HIP_API_ID_LAST = 408, HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice), HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties), @@ -545,7 +546,6 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipEventQuery: return "hipEventQuery"; case HIP_API_ID_hipEventRecord: return "hipEventRecord"; case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize"; - case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError"; case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount"; case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel"; case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice"; @@ -859,6 +859,8 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipUserObjectRelease: return "hipUserObjectRelease"; case HIP_API_ID_hipUserObjectRetain: return "hipUserObjectRetain"; case HIP_API_ID_hipWaitExternalSemaphoresAsync: return "hipWaitExternalSemaphoresAsync"; + case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError"; + case HIP_API_ID_hipStreamBatchMemOp: return "hipStreamBatchMemOp"; }; return "unknown"; }; @@ -1262,6 +1264,7 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipUserObjectRelease", name) == 0) return HIP_API_ID_hipUserObjectRelease; if (strcmp("hipUserObjectRetain", name) == 0) return HIP_API_ID_hipUserObjectRetain; if (strcmp("hipWaitExternalSemaphoresAsync", name) == 0) return HIP_API_ID_hipWaitExternalSemaphoresAsync; + if (strcmp("hipStreamBatchMemOp", name) == 0) return HIP_API_ID_hipStreamBatchMemOp; return HIP_API_ID_NONE; } @@ -3623,6 +3626,13 @@ typedef struct hip_api_data_s { unsigned int numExtSems; hipStream_t stream; } hipWaitExternalSemaphoresAsync; + struct { + hipStream_t stream; + unsigned int count; + hipStreamBatchMemOpParams* paramArray; + hipStreamBatchMemOpParams paramArray__val; + unsigned int flags; + } hipStreamBatchMemOp; } args; uint64_t *phase_data; } hip_api_data_t; @@ -5892,6 +5902,15 @@ typedef struct hip_api_data_s { cb_data.args.hipStreamWriteValue64.value = (uint64_t)value; \ cb_data.args.hipStreamWriteValue64.flags = (unsigned int)flags; \ }; + +// hipStreamBatchMemOp[('hipStream_t', 'stream'), ('unsigned int', 'count'), +// ('hipStreamBatchMemOpParams*', 'paramArray'), ('unsigned int', 'flags')] +#define INIT_hipStreamBatchMemOp_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipStreamBatchMemOp.stream = (hipStream_t)stream; \ + cb_data.args.hipStreamBatchMemOp.count = (unsigned int)count; \ + cb_data.args.hipStreamBatchMemOp.paramArray= (hipStreamBatchMemOpParams*)paramArray; \ + cb_data.args.hipStreamBatchMemOp.flags = (unsigned int)flags; \ +}; // hipTexRefGetAddress[('hipDeviceptr_t*', 'dev_ptr'), ('const textureReference*', 'texRef')] #define INIT_hipTexRefGetAddress_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipTexRefGetAddress.dev_ptr = (hipDeviceptr_t*)dptr; \ @@ -7527,6 +7546,11 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // hipStreamWriteValue64[('hipStream_t', 'stream'), ('void*', 'ptr'), ('uint64_t', 'value'), ('unsigned int', 'flags')] case HIP_API_ID_hipStreamWriteValue64: break; +// hipStreamBatchMemOp[('hipStream_t', 'stream'), ('unsigned int', 'count'), +// ('hipStreamBatchMemOpParams*', 'paramArray'), ('unsigned int', 'flags')] + case HIP_API_ID_hipStreamBatchMemOp: + if (data->args.hipStreamBatchMemOp.paramArray) data->args.hipStreamBatchMemOp.paramArray__val = *(data->args.hipStreamBatchMemOp.paramArray); + break; // hipTexRefGetAddress[('hipDeviceptr_t*', 'dev_ptr'), ('const textureReference*', 'texRef')] case HIP_API_ID_hipTexRefGetAddress: if (data->args.hipTexRefGetAddress.dev_ptr) data->args.hipTexRefGetAddress.dev_ptr__val = *(data->args.hipTexRefGetAddress.dev_ptr); @@ -10614,6 +10638,15 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue64.flags); oss << ")"; break; + case HIP_API_ID_hipStreamBatchMemOp: + oss << "hipStreamBatchMemOp("; + oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.stream); + oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.count); + if (data->args.hipStreamBatchMemOp.paramArray == NULL) oss << ", paramArray=NULL"; + else { oss << ", paramArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.paramArray__val); } + oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.flags); + oss << ")"; + break; case HIP_API_ID_hipTexRefGetAddress: oss << "hipTexRefGetAddress("; if (data->args.hipTexRefGetAddress.dev_ptr == NULL) oss << "dev_ptr=NULL"; diff --git a/hipamd/src/CMakeLists.txt b/hipamd/src/CMakeLists.txt index 3480dbbbfa..a276588825 100644 --- a/hipamd/src/CMakeLists.txt +++ b/hipamd/src/CMakeLists.txt @@ -299,7 +299,7 @@ if(WIN32 OR NOT BUILD_SHARED_LIBS) # rocprofiler-register is not support on Windows set(HIP_ENABLE_ROCPROFILER_REGISTER OFF) else() - option(HIP_ENABLE_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON) + option(HIP_ENABLE_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON) endif() if(HIP_ENABLE_ROCPROFILER_REGISTER) diff --git a/hipamd/src/amdhip.def b/hipamd/src/amdhip.def index a119c5e3c4..355f180d7b 100644 --- a/hipamd/src/amdhip.def +++ b/hipamd/src/amdhip.def @@ -480,3 +480,4 @@ hipGraphExecNodeSetParams hipDrvGraphMemcpyNodeSetParams hipDrvGraphMemcpyNodeGetParams hipExtHostAlloc +hipStreamBatchMemOp diff --git a/hipamd/src/hip_api_trace.cpp b/hipamd/src/hip_api_trace.cpp index c73b9cce60..b2178d78e8 100644 --- a/hipamd/src/hip_api_trace.cpp +++ b/hipamd/src/hip_api_trace.cpp @@ -602,6 +602,8 @@ hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, uint64_t value, u uint64_t mask); hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value, unsigned int flags); hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags); +hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count, + hipStreamBatchMemOpParams* paramArray, unsigned int flags); hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc, const HIP_TEXTURE_DESC* pTexDesc, const HIP_RESOURCE_VIEW_DESC* pResViewDesc); @@ -1197,6 +1199,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipStreamWaitValue64_fn = hip::hipStreamWaitValue64; ptrDispatchTable->hipStreamWriteValue32_fn = hip::hipStreamWriteValue32; ptrDispatchTable->hipStreamWriteValue64_fn = hip::hipStreamWriteValue64; + ptrDispatchTable->hipStreamBatchMemOp_fn = hip::hipStreamBatchMemOp; ptrDispatchTable->hipTexObjectCreate_fn = hip::hipTexObjectCreate; ptrDispatchTable->hipTexObjectDestroy_fn = hip::hipTexObjectDestroy; ptrDispatchTable->hipTexObjectGetResourceDesc_fn = hip::hipTexObjectGetResourceDesc; @@ -1887,6 +1890,9 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipDrvGraphMemcpyNodeSetParams_fn, 460) HIP_ENFORCE_ABI(HipDispatchTable, hipExtHostAlloc_fn, 461) // HIP_RUNTIME_API_TABLE_STEP_VERSION == 6 HIP_ENFORCE_ABI(HipDispatchTable, hipDeviceGetTexture1DLinearMaxWidth_fn, 462) +// HIP_RUNTIME_API_TABLE_STEP_VERSION == 7 +HIP_ENFORCE_ABI(HipDispatchTable, hipStreamBatchMemOp_fn, 463); + // if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below // will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.: @@ -1894,9 +1900,9 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipDeviceGetTexture1DLinearMaxWidth_fn, 462) // HIP_ENFORCE_ABI(, , 8) // // HIP_ENFORCE_ABI_VERSIONING(
, 9) <- 8 + 1 = 9 -HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 463) +HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 464) -static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 6, +static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 7, "If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function " "pointers and then update this check so it is true"); #endif diff --git a/hipamd/src/hip_hcc.map.in b/hipamd/src/hip_hcc.map.in index ea09a35d53..1b7c323563 100644 --- a/hipamd/src/hip_hcc.map.in +++ b/hipamd/src/hip_hcc.map.in @@ -583,6 +583,7 @@ local: hip_6.3 { global: hipExtHostAlloc; + hipStreamBatchMemOp; local: *; } hip_6.2; diff --git a/hipamd/src/hip_stream_ops.cpp b/hipamd/src/hip_stream_ops.cpp index 481e839ea9..3287d47d49 100644 --- a/hipamd/src/hip_stream_ops.cpp +++ b/hipamd/src/hip_stream_ops.cpp @@ -23,8 +23,34 @@ #include "platform/command_utils.hpp" namespace hip { +hipError_t ihipBatchMemOperation(hipStream_t stream, cl_command_type cmdType, unsigned int count, + hipStreamBatchMemOpParams* paramArray, unsigned int flags) { + if (paramArray == nullptr || flags != 0 || count > 256) { + return hipErrorInvalidValue; + } + + if (!hip::isValid(stream)) { + return hipErrorContextIsDestroyed; + } + + hip::Stream* hip_stream = hip::getStream(stream); + amd::Command::EventWaitList waitList; + + amd::BatchMemoryOperationCommand* command = new amd::BatchMemoryOperationCommand( + *hip_stream, cmdType, count, flags, waitList, paramArray, sizeof(hipStreamBatchMemOpParams)); + + if (command == nullptr) { + return hipErrorOutOfMemory; + } + command->enqueue(); + command->release(); + HIP_RETURN(hipSuccess); +} + + hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr, - uint64_t value, uint64_t mask, unsigned int flags, size_t sizeBytes) { + uint64_t value, uint64_t mask, unsigned int flags, + size_t sizeBytes) { size_t offset = 0; unsigned int outFlags = 0; @@ -136,4 +162,15 @@ hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value, 0, // flags un-used for now set it to 0 sizeof(uint64_t))); } + +hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count, + hipStreamBatchMemOpParams* paramArray, unsigned int flags) { + HIP_INIT_API(hipStreamBatchMemOp, count, paramArray, flags); + HIP_RETURN_DURATION(ihipBatchMemOperation( + stream, + ROCCLR_COMMAND_BATCH_STREAM, + count, + paramArray, + flags)); + } } // namespace hip diff --git a/hipamd/src/hip_table_interface.cpp b/hipamd/src/hip_table_interface.cpp index 7d2ebf6fa2..237f0d8805 100644 --- a/hipamd/src/hip_table_interface.cpp +++ b/hipamd/src/hip_table_interface.cpp @@ -1413,6 +1413,10 @@ hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags) { return hip::GetHipDispatchTable()->hipStreamWriteValue64_fn(stream, ptr, value, flags); } +hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count, + hipStreamBatchMemOpParams* paramArray, unsigned int flags) { + return hip::GetHipDispatchTable()->hipStreamBatchMemOp_fn(stream, count, paramArray, flags); +} hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc, const HIP_TEXTURE_DESC* pTexDesc, const HIP_RESOURCE_VIEW_DESC* pResViewDesc) { diff --git a/rocclr/device/blit.hpp b/rocclr/device/blit.hpp index 4cd8c55be4..ec98886b38 100644 --- a/rocclr/device/blit.hpp +++ b/rocclr/device/blit.hpp @@ -233,6 +233,12 @@ class BlitManager : public amd::HeapObject { uint64_t mask ) const = 0; + //! Stream batch memory operation + virtual bool batchMemOps(const void* paramArray, + size_t paramSize, + uint32_t count + ) const = 0; + //! Enables synchronization on blit operations void enableSynchronization() { syncOperation_ = true; } diff --git a/rocclr/device/blitcl.cpp b/rocclr/device/blitcl.cpp index 897ebf82c8..9abfc1e795 100644 --- a/rocclr/device/blitcl.cpp +++ b/rocclr/device/blitcl.cpp @@ -43,6 +43,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( extern void __amd_streamOpsWait(__global uint*, __global ulong*, ulong, ulong, ulong); + extern void __amd_batchMemOp(__global void*, uint count); + extern void __ockl_dm_init_v1(ulong, ulong, uint, uint); extern void __ockl_gws_init(uint nwm1, uint rid); @@ -162,6 +164,10 @@ const char* BlitLinearSourceCode = BLIT_KERNELS( ulong4 srcRect, ulong4 dstRect, ulong4 size) { __amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size); } + + __kernel void __amd_rocclr_batchMemOp(__global void* params, uint count) { + __amd_batchMemOp(params, count); + } ); const char* HipExtraSourceCode = BLIT_KERNELS( @@ -254,7 +260,6 @@ const char* BlitImageSourceCode = BLIT_KERNELS( __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar, srcOrigin, dstOrigin, size, format, pitch); } - ); } // namespace amd::device diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index f5aa124cce..28852ca0ca 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -92,6 +92,7 @@ class SvmMapMemoryCommand; class SvmUnmapMemoryCommand; class SvmPrefetchAsyncCommand; class StreamOperationCommand; +class BatchMemoryOperationCommand; class VirtualMapCommand; class ExternalSemaphoreCmd; class Isa; @@ -1308,6 +1309,9 @@ class VirtualDevice : public amd::HeapObject { ShouldNotReachHere(); } virtual void submitStreamOperation(amd::StreamOperationCommand& cmd) { ShouldNotReachHere(); } + virtual void submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd) { + ShouldNotReachHere(); + } virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); } virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; } diff --git a/rocclr/device/pal/palblit.hpp b/rocclr/device/pal/palblit.hpp index 9099a9a47c..7df8731328 100644 --- a/rocclr/device/pal/palblit.hpp +++ b/rocclr/device/pal/palblit.hpp @@ -475,6 +475,13 @@ class KernelBlitManager : public DmaBlitManager { uint number_of_initial_blocks ) const; + //! Batch memory ops- Submits batch of streamWaits and streamWrite operations. + virtual bool batchMemOps(const void* paramArray, + size_t paramSize, + uint32_t count) const { + assert(!"Unimplemented"); + return false; + } private: static constexpr size_t MaxXferBuffers = 2; static constexpr uint TransferSplitSize = 3; diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index e96cabbcea..a9ffb4d95a 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -2511,6 +2511,38 @@ bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, si return result; } +// ================================================================================================ +bool KernelBlitManager::batchMemOps(const void* paramArray, size_t paramSize, + uint32_t count) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + uint blitType = BatchMemOp; + size_t dim = 1; + + size_t globalWorkOffset[1] = { 0 }; + size_t globalWorkSize[1] = { count }; + size_t localWorkSize[1] = { 1 }; + + // Get constant buffer and copy the array of parameters + constexpr bool kDirectVa = true; + auto constBuf = gpu().allocKernArg((count * paramSize), kCBAlignment); + memcpy(constBuf, paramArray, (count * paramSize)); + + setArgument(kernels_[blitType], 0, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &count); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = captureArguments(kernels_[blitType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); + releaseArguments(parameters); + synchronize(); + + return result; +} + // ================================================================================================ bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Memory* initial_blocks, uint heap_size, uint number_of_initial_blocks) const { diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp index 63c5b0737c..d7c6421025 100644 --- a/rocclr/device/rocm/rocblit.hpp +++ b/rocclr/device/rocm/rocblit.hpp @@ -285,6 +285,7 @@ class KernelBlitManager : public DmaBlitManager { BlitCopyImage1DA, BlitCopyImageToBuffer, BlitCopyBufferToImage, + BatchMemOp, BlitTotal }; @@ -503,6 +504,9 @@ class KernelBlitManager : public DmaBlitManager { uint64_t mask ) const; + //! Batch memory ops- Submits batch of streamWaits and streamWrite operations. + virtual bool batchMemOps(const void* paramArray, size_t paramSize, uint32_t count) const; + virtual amd::Monitor* lockXfer() const { return &lockXferOps_; } virtual bool initHeap(device::Memory* heap_to_initialize, @@ -586,13 +590,15 @@ class KernelBlitManager : public DmaBlitManager { }; static const char* BlitName[KernelBlitManager::BlitTotal] = { - "__amd_rocclr_fillBufferAligned", "__amd_rocclr_fillBufferAligned2D", "__amd_rocclr_copyBuffer", - "__amd_rocclr_copyBufferAligned", "__amd_rocclr_copyBufferRect", - "__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait", - "__amd_rocclr_scheduler", "__amd_rocclr_gwsInit", "__amd_rocclr_initHeap", - "__amd_rocclr_fillImage", "__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA", - "__amd_rocclr_copyImageToBuffer", "__amd_rocclr_copyBufferToImage" -}; + "__amd_rocclr_fillBufferAligned", "__amd_rocclr_fillBufferAligned2D", + "__amd_rocclr_copyBuffer", "__amd_rocclr_copyBufferAligned", + "__amd_rocclr_copyBufferRect", "__amd_rocclr_copyBufferRectAligned", + "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait", + "__amd_rocclr_scheduler", "__amd_rocclr_gwsInit", + "__amd_rocclr_initHeap", "__amd_rocclr_fillImage", + "__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA", + "__amd_rocclr_copyImageToBuffer", "__amd_rocclr_copyBufferToImage", + "__amd_rocclr_batchMemOp"}; inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value, size_t offset, diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index ba78ef27e8..482df68df9 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -2719,8 +2719,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { else { // mask is applied on value before performing // the comparision defined by 'condition' - bool result = static_cast(blitMgr()).streamOpsWait(*memory, value, offset, - sizeBytes, flags, mask); + bool result = blitMgr().streamOpsWait(*memory, value, offset, sizeBytes, flags, mask); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx." " Flags: 0x%lx mask: 0x%lx", value, flags, mask); if (!result) { @@ -2733,8 +2732,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { // Ensure memory ordering preceding the write dispatchBarrierPacket(kBarrierPacketReleaseHeader); - bool result = static_cast(blitMgr()).streamOpsWrite(*memory, value, - offset, sizeBytes); + bool result = blitMgr().streamOpsWrite(*memory, value, offset, sizeBytes); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value); if (!result) { LogError("submitStreamOperation: Write failed!"); @@ -2745,6 +2743,18 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { profilingEnd(cmd); } +// ================================================================================================ +void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + profilingBegin(cmd); + + bool result = blitMgr().batchMemOps(cmd.getParamPtr(), cmd.paramSize(), cmd.count()); + if (!result) { + LogError("submitBatchMemoryOperation failed!"); + } + profilingEnd(cmd); +} // ================================================================================================ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index fad6561fb3..38a9181fa1 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -368,6 +368,7 @@ class VirtualGPU : public device::VirtualDevice { void flush(amd::Command* list = nullptr, bool wait = false); void submitFillMemory(amd::FillMemoryCommand& cmd); void submitStreamOperation(amd::StreamOperationCommand& cmd); + void submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd); void submitVirtualMap(amd::VirtualMapCommand& cmd); void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp index e825bfff98..395e6bc82f 100644 --- a/rocclr/platform/command.hpp +++ b/rocclr/platform/command.hpp @@ -845,7 +845,7 @@ class FillMemoryCommand : public OneMemoryArgCommand { class StreamOperationCommand : public OneMemoryArgCommand { private: - uint64_t value_; // !< Value to Wait on or to Write. + uint64_t value_; // !< Value to Wait on or to Write. uint64_t mask_; // !< Mask to be applied on signal value for Wait operation. unsigned int flags_; // !< Flags defining the Wait condition. size_t offset_; // !< Offset into memory for Write @@ -888,6 +888,46 @@ class StreamOperationCommand : public OneMemoryArgCommand { const size_t sizeBytes() const { return sizeBytes_; } }; +/*! \brief A batch memory operation command. + * + * \details Batch operations to synchronize the stream via memory operations + * Operations are either 32-bit stream wait or write. + * Wait: All the commands issued after stream wait are not executed + * until the wait condition is true. + * Write: Writes a 32 or 64 bit vaue to the memory using a GPU Blit. + * The operations are enqueued in the order they appear in the array. + */ + +class BatchMemoryOperationCommand : public Command { + public: + BatchMemoryOperationCommand(HostQueue& queue, cl_command_type cmdType, uint32_t count, + uint32_t flags, EventWaitList& eventWaitList, const void* paramArray, + size_t paramSize) + : Command(queue, cmdType, eventWaitList), + count_(count), + paramArray_(paramArray), + flags_(flags), + paramSize_(paramSize) { + // Sanity check + assert(((cmdType == ROCCLR_COMMAND_BATCH_STREAM)) && "Invalid batch memory operation"); + } + + virtual void submit(device::VirtualDevice& device) { device.submitBatchMemoryOperation(*this); } + + //! Returns the value + const uint64_t count() const { return count_; } + //! Return the pointer to the paramList + const void* getParamPtr() { return paramArray_; } + //! Return the size of a single mem op param in bytes + const size_t paramSize() const { return paramSize_; } + + private: + uint32_t count_; // !< The number of operations in the array. + uint32_t flags_; // !< Reserved for future expansion. Must be 0. + const void* paramArray_; // !< Pointer to the array of individual operations + size_t paramSize_; // !< size in bytes of the param array passed +}; + /*! \brief A generic copy memory command * * \details Used for both buffers and images. Backends are expected @@ -1858,6 +1898,7 @@ union ComputeCommand { CopyMemoryP2PCommand cmd25; SvmPrefetchAsyncCommand cmd26; VirtualMapCommand cmd27; + BatchMemoryOperationCommand cmd28; ComputeCommand() {} ~ComputeCommand() {} }; diff --git a/rocclr/platform/command_utils.hpp b/rocclr/platform/command_utils.hpp index 254485bc95..2a706050cb 100644 --- a/rocclr/platform/command_utils.hpp +++ b/rocclr/platform/command_utils.hpp @@ -24,6 +24,7 @@ // Dummy command types for Stream Wait and Write commands. #define ROCCLR_COMMAND_STREAM_WAIT_VALUE 0x4501 #define ROCCLR_COMMAND_STREAM_WRITE_VALUE 0x4502 +#define ROCCLR_COMMAND_BATCH_STREAM 0x4503 // Stream Wait Value Conidtions #define ROCCLR_STREAM_WAIT_VALUE_GTE 0x0