diff --git a/hipamd/include/hip/amd_detail/hip_api_trace.hpp b/hipamd/include/hip/amd_detail/hip_api_trace.hpp
index 7da13a9805..acfbe1a2e0 100644
--- a/hipamd/include/hip/amd_detail/hip_api_trace.hpp
+++ b/hipamd/include/hip/amd_detail/hip_api_trace.hpp
@@ -61,7 +61,7 @@
// - Reset any of the *_STEP_VERSION defines to zero if the corresponding *_MAJOR_VERSION increases
#define HIP_API_TABLE_STEP_VERSION 0
#define HIP_COMPILER_API_TABLE_STEP_VERSION 0
-#define HIP_RUNTIME_API_TABLE_STEP_VERSION 6
+#define HIP_RUNTIME_API_TABLE_STEP_VERSION 7
// HIP API interface
typedef hipError_t (*t___hipPopCallConfiguration)(dim3* gridDim, dim3* blockDim, size_t* sharedMem,
@@ -722,6 +722,8 @@ typedef hipError_t (*t_hipStreamWriteValue32)(hipStream_t stream, void* ptr, uin
unsigned int flags);
typedef hipError_t (*t_hipStreamWriteValue64)(hipStream_t stream, void* ptr, uint64_t value,
unsigned int flags);
+typedef hipError_t (*t_hipStreamBatchMemOp)(hipStream_t stream, unsigned int count,
+ hipStreamBatchMemOpParams* paramArray, unsigned int flags);
typedef hipError_t (*t_hipTexObjectCreate)(hipTextureObject_t* pTexObject,
const HIP_RESOURCE_DESC* pResDesc,
const HIP_TEXTURE_DESC* pTexDesc,
@@ -1519,6 +1521,9 @@ struct HipDispatchTable {
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 6
t_hipDeviceGetTexture1DLinearMaxWidth hipDeviceGetTexture1DLinearMaxWidth_fn;
+ // HIP_RUNTIME_API_TABLE_STEP_VERSION == 7
+ t_hipStreamBatchMemOp hipStreamBatchMemOp_fn;
+
// DO NOT EDIT ABOVE!
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 7
diff --git a/hipamd/include/hip/amd_detail/hip_prof_str.h b/hipamd/include/hip/amd_detail/hip_prof_str.h
index e11b1abcf3..9cb1e275e6 100644
--- a/hipamd/include/hip/amd_detail/hip_prof_str.h
+++ b/hipamd/include/hip/amd_detail/hip_prof_str.h
@@ -425,7 +425,8 @@ enum hip_api_id_t {
HIP_API_ID_hipMemcpyHtoAAsync = 405,
HIP_API_ID_hipSetValidDevices = 406,
HIP_API_ID_hipExtHostAlloc = 407,
- HIP_API_ID_LAST = 407,
+ HIP_API_ID_hipStreamBatchMemOp = 408,
+ HIP_API_ID_LAST = 408,
HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice),
HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties),
@@ -545,7 +546,6 @@ static inline const char* hip_api_name(const uint32_t id) {
case HIP_API_ID_hipEventQuery: return "hipEventQuery";
case HIP_API_ID_hipEventRecord: return "hipEventRecord";
case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize";
- case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError";
case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount";
case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel";
case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice";
@@ -859,6 +859,8 @@ static inline const char* hip_api_name(const uint32_t id) {
case HIP_API_ID_hipUserObjectRelease: return "hipUserObjectRelease";
case HIP_API_ID_hipUserObjectRetain: return "hipUserObjectRetain";
case HIP_API_ID_hipWaitExternalSemaphoresAsync: return "hipWaitExternalSemaphoresAsync";
+ case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError";
+ case HIP_API_ID_hipStreamBatchMemOp: return "hipStreamBatchMemOp";
};
return "unknown";
};
@@ -1262,6 +1264,7 @@ static inline uint32_t hipApiIdByName(const char* name) {
if (strcmp("hipUserObjectRelease", name) == 0) return HIP_API_ID_hipUserObjectRelease;
if (strcmp("hipUserObjectRetain", name) == 0) return HIP_API_ID_hipUserObjectRetain;
if (strcmp("hipWaitExternalSemaphoresAsync", name) == 0) return HIP_API_ID_hipWaitExternalSemaphoresAsync;
+ if (strcmp("hipStreamBatchMemOp", name) == 0) return HIP_API_ID_hipStreamBatchMemOp;
return HIP_API_ID_NONE;
}
@@ -3623,6 +3626,13 @@ typedef struct hip_api_data_s {
unsigned int numExtSems;
hipStream_t stream;
} hipWaitExternalSemaphoresAsync;
+ struct {
+ hipStream_t stream;
+ unsigned int count;
+ hipStreamBatchMemOpParams* paramArray;
+ hipStreamBatchMemOpParams paramArray__val;
+ unsigned int flags;
+ } hipStreamBatchMemOp;
} args;
uint64_t *phase_data;
} hip_api_data_t;
@@ -5892,6 +5902,15 @@ typedef struct hip_api_data_s {
cb_data.args.hipStreamWriteValue64.value = (uint64_t)value; \
cb_data.args.hipStreamWriteValue64.flags = (unsigned int)flags; \
};
+
+// hipStreamBatchMemOp[('hipStream_t', 'stream'), ('unsigned int', 'count'),
+// ('hipStreamBatchMemOpParams*', 'paramArray'), ('unsigned int', 'flags')]
+#define INIT_hipStreamBatchMemOp_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamBatchMemOp.stream = (hipStream_t)stream; \
+ cb_data.args.hipStreamBatchMemOp.count = (unsigned int)count; \
+ cb_data.args.hipStreamBatchMemOp.paramArray= (hipStreamBatchMemOpParams*)paramArray; \
+ cb_data.args.hipStreamBatchMemOp.flags = (unsigned int)flags; \
+};
// hipTexRefGetAddress[('hipDeviceptr_t*', 'dev_ptr'), ('const textureReference*', 'texRef')]
#define INIT_hipTexRefGetAddress_CB_ARGS_DATA(cb_data) { \
cb_data.args.hipTexRefGetAddress.dev_ptr = (hipDeviceptr_t*)dptr; \
@@ -7527,6 +7546,11 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
// hipStreamWriteValue64[('hipStream_t', 'stream'), ('void*', 'ptr'), ('uint64_t', 'value'), ('unsigned int', 'flags')]
case HIP_API_ID_hipStreamWriteValue64:
break;
+// hipStreamBatchMemOp[('hipStream_t', 'stream'), ('unsigned int', 'count'),
+// ('hipStreamBatchMemOpParams*', 'paramArray'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipStreamBatchMemOp:
+ if (data->args.hipStreamBatchMemOp.paramArray) data->args.hipStreamBatchMemOp.paramArray__val = *(data->args.hipStreamBatchMemOp.paramArray);
+ break;
// hipTexRefGetAddress[('hipDeviceptr_t*', 'dev_ptr'), ('const textureReference*', 'texRef')]
case HIP_API_ID_hipTexRefGetAddress:
if (data->args.hipTexRefGetAddress.dev_ptr) data->args.hipTexRefGetAddress.dev_ptr__val = *(data->args.hipTexRefGetAddress.dev_ptr);
@@ -10614,6 +10638,15 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue64.flags);
oss << ")";
break;
+ case HIP_API_ID_hipStreamBatchMemOp:
+ oss << "hipStreamBatchMemOp(";
+ oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.stream);
+ oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.count);
+ if (data->args.hipStreamBatchMemOp.paramArray == NULL) oss << ", paramArray=NULL";
+ else { oss << ", paramArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.paramArray__val); }
+ oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.flags);
+ oss << ")";
+ break;
case HIP_API_ID_hipTexRefGetAddress:
oss << "hipTexRefGetAddress(";
if (data->args.hipTexRefGetAddress.dev_ptr == NULL) oss << "dev_ptr=NULL";
diff --git a/hipamd/src/CMakeLists.txt b/hipamd/src/CMakeLists.txt
index 3480dbbbfa..a276588825 100644
--- a/hipamd/src/CMakeLists.txt
+++ b/hipamd/src/CMakeLists.txt
@@ -299,7 +299,7 @@ if(WIN32 OR NOT BUILD_SHARED_LIBS)
# rocprofiler-register is not support on Windows
set(HIP_ENABLE_ROCPROFILER_REGISTER OFF)
else()
- option(HIP_ENABLE_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
+ option(HIP_ENABLE_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
endif()
if(HIP_ENABLE_ROCPROFILER_REGISTER)
diff --git a/hipamd/src/amdhip.def b/hipamd/src/amdhip.def
index a119c5e3c4..355f180d7b 100644
--- a/hipamd/src/amdhip.def
+++ b/hipamd/src/amdhip.def
@@ -480,3 +480,4 @@ hipGraphExecNodeSetParams
hipDrvGraphMemcpyNodeSetParams
hipDrvGraphMemcpyNodeGetParams
hipExtHostAlloc
+hipStreamBatchMemOp
diff --git a/hipamd/src/hip_api_trace.cpp b/hipamd/src/hip_api_trace.cpp
index c73b9cce60..b2178d78e8 100644
--- a/hipamd/src/hip_api_trace.cpp
+++ b/hipamd/src/hip_api_trace.cpp
@@ -602,6 +602,8 @@ hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, uint64_t value, u
uint64_t mask);
hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value, unsigned int flags);
hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags);
+hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
+ hipStreamBatchMemOpParams* paramArray, unsigned int flags);
hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc,
const HIP_TEXTURE_DESC* pTexDesc,
const HIP_RESOURCE_VIEW_DESC* pResViewDesc);
@@ -1197,6 +1199,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) {
ptrDispatchTable->hipStreamWaitValue64_fn = hip::hipStreamWaitValue64;
ptrDispatchTable->hipStreamWriteValue32_fn = hip::hipStreamWriteValue32;
ptrDispatchTable->hipStreamWriteValue64_fn = hip::hipStreamWriteValue64;
+ ptrDispatchTable->hipStreamBatchMemOp_fn = hip::hipStreamBatchMemOp;
ptrDispatchTable->hipTexObjectCreate_fn = hip::hipTexObjectCreate;
ptrDispatchTable->hipTexObjectDestroy_fn = hip::hipTexObjectDestroy;
ptrDispatchTable->hipTexObjectGetResourceDesc_fn = hip::hipTexObjectGetResourceDesc;
@@ -1887,6 +1890,9 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipDrvGraphMemcpyNodeSetParams_fn, 460)
HIP_ENFORCE_ABI(HipDispatchTable, hipExtHostAlloc_fn, 461)
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 6
HIP_ENFORCE_ABI(HipDispatchTable, hipDeviceGetTexture1DLinearMaxWidth_fn, 462)
+// HIP_RUNTIME_API_TABLE_STEP_VERSION == 7
+HIP_ENFORCE_ABI(HipDispatchTable, hipStreamBatchMemOp_fn, 463);
+
// if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below
// will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.:
@@ -1894,9 +1900,9 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipDeviceGetTexture1DLinearMaxWidth_fn, 462)
// HIP_ENFORCE_ABI(
, , 8)
//
// HIP_ENFORCE_ABI_VERSIONING(, 9) <- 8 + 1 = 9
-HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 463)
+HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 464)
-static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 6,
+static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 7,
"If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function "
"pointers and then update this check so it is true");
#endif
diff --git a/hipamd/src/hip_hcc.map.in b/hipamd/src/hip_hcc.map.in
index ea09a35d53..1b7c323563 100644
--- a/hipamd/src/hip_hcc.map.in
+++ b/hipamd/src/hip_hcc.map.in
@@ -583,6 +583,7 @@ local:
hip_6.3 {
global:
hipExtHostAlloc;
+ hipStreamBatchMemOp;
local:
*;
} hip_6.2;
diff --git a/hipamd/src/hip_stream_ops.cpp b/hipamd/src/hip_stream_ops.cpp
index 481e839ea9..3287d47d49 100644
--- a/hipamd/src/hip_stream_ops.cpp
+++ b/hipamd/src/hip_stream_ops.cpp
@@ -23,8 +23,34 @@
#include "platform/command_utils.hpp"
namespace hip {
+hipError_t ihipBatchMemOperation(hipStream_t stream, cl_command_type cmdType, unsigned int count,
+ hipStreamBatchMemOpParams* paramArray, unsigned int flags) {
+ if (paramArray == nullptr || flags != 0 || count > 256) {
+ return hipErrorInvalidValue;
+ }
+
+ if (!hip::isValid(stream)) {
+ return hipErrorContextIsDestroyed;
+ }
+
+ hip::Stream* hip_stream = hip::getStream(stream);
+ amd::Command::EventWaitList waitList;
+
+ amd::BatchMemoryOperationCommand* command = new amd::BatchMemoryOperationCommand(
+ *hip_stream, cmdType, count, flags, waitList, paramArray, sizeof(hipStreamBatchMemOpParams));
+
+ if (command == nullptr) {
+ return hipErrorOutOfMemory;
+ }
+ command->enqueue();
+ command->release();
+ HIP_RETURN(hipSuccess);
+}
+
+
hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr,
- uint64_t value, uint64_t mask, unsigned int flags, size_t sizeBytes) {
+ uint64_t value, uint64_t mask, unsigned int flags,
+ size_t sizeBytes) {
size_t offset = 0;
unsigned int outFlags = 0;
@@ -136,4 +162,15 @@ hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value,
0, // flags un-used for now set it to 0
sizeof(uint64_t)));
}
+
+hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
+ hipStreamBatchMemOpParams* paramArray, unsigned int flags) {
+ HIP_INIT_API(hipStreamBatchMemOp, count, paramArray, flags);
+ HIP_RETURN_DURATION(ihipBatchMemOperation(
+ stream,
+ ROCCLR_COMMAND_BATCH_STREAM,
+ count,
+ paramArray,
+ flags));
+ }
} // namespace hip
diff --git a/hipamd/src/hip_table_interface.cpp b/hipamd/src/hip_table_interface.cpp
index 7d2ebf6fa2..237f0d8805 100644
--- a/hipamd/src/hip_table_interface.cpp
+++ b/hipamd/src/hip_table_interface.cpp
@@ -1413,6 +1413,10 @@ hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value,
unsigned int flags) {
return hip::GetHipDispatchTable()->hipStreamWriteValue64_fn(stream, ptr, value, flags);
}
+hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
+ hipStreamBatchMemOpParams* paramArray, unsigned int flags) {
+ return hip::GetHipDispatchTable()->hipStreamBatchMemOp_fn(stream, count, paramArray, flags);
+}
hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc,
const HIP_TEXTURE_DESC* pTexDesc,
const HIP_RESOURCE_VIEW_DESC* pResViewDesc) {
diff --git a/rocclr/device/blit.hpp b/rocclr/device/blit.hpp
index 4cd8c55be4..ec98886b38 100644
--- a/rocclr/device/blit.hpp
+++ b/rocclr/device/blit.hpp
@@ -233,6 +233,12 @@ class BlitManager : public amd::HeapObject {
uint64_t mask
) const = 0;
+ //! Stream batch memory operation
+ virtual bool batchMemOps(const void* paramArray,
+ size_t paramSize,
+ uint32_t count
+ ) const = 0;
+
//! Enables synchronization on blit operations
void enableSynchronization() { syncOperation_ = true; }
diff --git a/rocclr/device/blitcl.cpp b/rocclr/device/blitcl.cpp
index 897ebf82c8..9abfc1e795 100644
--- a/rocclr/device/blitcl.cpp
+++ b/rocclr/device/blitcl.cpp
@@ -43,6 +43,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
extern void __amd_streamOpsWait(__global uint*, __global ulong*, ulong, ulong, ulong);
+ extern void __amd_batchMemOp(__global void*, uint count);
+
extern void __ockl_dm_init_v1(ulong, ulong, uint, uint);
extern void __ockl_gws_init(uint nwm1, uint rid);
@@ -162,6 +164,10 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
ulong4 srcRect, ulong4 dstRect, ulong4 size) {
__amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size);
}
+
+ __kernel void __amd_rocclr_batchMemOp(__global void* params, uint count) {
+ __amd_batchMemOp(params, count);
+ }
);
const char* HipExtraSourceCode = BLIT_KERNELS(
@@ -254,7 +260,6 @@ const char* BlitImageSourceCode = BLIT_KERNELS(
__amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar, srcOrigin, dstOrigin, size, format,
pitch);
}
-
);
} // namespace amd::device
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index f5aa124cce..28852ca0ca 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -92,6 +92,7 @@ class SvmMapMemoryCommand;
class SvmUnmapMemoryCommand;
class SvmPrefetchAsyncCommand;
class StreamOperationCommand;
+class BatchMemoryOperationCommand;
class VirtualMapCommand;
class ExternalSemaphoreCmd;
class Isa;
@@ -1308,6 +1309,9 @@ class VirtualDevice : public amd::HeapObject {
ShouldNotReachHere();
}
virtual void submitStreamOperation(amd::StreamOperationCommand& cmd) { ShouldNotReachHere(); }
+ virtual void submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd) {
+ ShouldNotReachHere();
+ }
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
diff --git a/rocclr/device/pal/palblit.hpp b/rocclr/device/pal/palblit.hpp
index 9099a9a47c..7df8731328 100644
--- a/rocclr/device/pal/palblit.hpp
+++ b/rocclr/device/pal/palblit.hpp
@@ -475,6 +475,13 @@ class KernelBlitManager : public DmaBlitManager {
uint number_of_initial_blocks
) const;
+ //! Batch memory ops- Submits batch of streamWaits and streamWrite operations.
+ virtual bool batchMemOps(const void* paramArray,
+ size_t paramSize,
+ uint32_t count) const {
+ assert(!"Unimplemented");
+ return false;
+ }
private:
static constexpr size_t MaxXferBuffers = 2;
static constexpr uint TransferSplitSize = 3;
diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index e96cabbcea..a9ffb4d95a 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -2511,6 +2511,38 @@ bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, si
return result;
}
+// ================================================================================================
+bool KernelBlitManager::batchMemOps(const void* paramArray, size_t paramSize,
+ uint32_t count) const {
+ amd::ScopedLock k(lockXferOps_);
+ bool result = false;
+ uint blitType = BatchMemOp;
+ size_t dim = 1;
+
+ size_t globalWorkOffset[1] = { 0 };
+ size_t globalWorkSize[1] = { count };
+ size_t localWorkSize[1] = { 1 };
+
+ // Get constant buffer and copy the array of parameters
+ constexpr bool kDirectVa = true;
+ auto constBuf = gpu().allocKernArg((count * paramSize), kCBAlignment);
+ memcpy(constBuf, paramArray, (count * paramSize));
+
+ setArgument(kernels_[blitType], 0, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
+ setArgument(kernels_[blitType], 1, sizeof(cl_mem), &count);
+
+ // Create ND range object for the kernel's execution
+ amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
+
+ // Execute the blit
+ address parameters = captureArguments(kernels_[blitType]);
+ result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
+ releaseArguments(parameters);
+ synchronize();
+
+ return result;
+}
+
// ================================================================================================
bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Memory* initial_blocks,
uint heap_size, uint number_of_initial_blocks) const {
diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp
index 63c5b0737c..d7c6421025 100644
--- a/rocclr/device/rocm/rocblit.hpp
+++ b/rocclr/device/rocm/rocblit.hpp
@@ -285,6 +285,7 @@ class KernelBlitManager : public DmaBlitManager {
BlitCopyImage1DA,
BlitCopyImageToBuffer,
BlitCopyBufferToImage,
+ BatchMemOp,
BlitTotal
};
@@ -503,6 +504,9 @@ class KernelBlitManager : public DmaBlitManager {
uint64_t mask
) const;
+ //! Batch memory ops- Submits batch of streamWaits and streamWrite operations.
+ virtual bool batchMemOps(const void* paramArray, size_t paramSize, uint32_t count) const;
+
virtual amd::Monitor* lockXfer() const { return &lockXferOps_; }
virtual bool initHeap(device::Memory* heap_to_initialize,
@@ -586,13 +590,15 @@ class KernelBlitManager : public DmaBlitManager {
};
static const char* BlitName[KernelBlitManager::BlitTotal] = {
- "__amd_rocclr_fillBufferAligned", "__amd_rocclr_fillBufferAligned2D", "__amd_rocclr_copyBuffer",
- "__amd_rocclr_copyBufferAligned", "__amd_rocclr_copyBufferRect",
- "__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait",
- "__amd_rocclr_scheduler", "__amd_rocclr_gwsInit", "__amd_rocclr_initHeap",
- "__amd_rocclr_fillImage", "__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA",
- "__amd_rocclr_copyImageToBuffer", "__amd_rocclr_copyBufferToImage"
-};
+ "__amd_rocclr_fillBufferAligned", "__amd_rocclr_fillBufferAligned2D",
+ "__amd_rocclr_copyBuffer", "__amd_rocclr_copyBufferAligned",
+ "__amd_rocclr_copyBufferRect", "__amd_rocclr_copyBufferRectAligned",
+ "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait",
+ "__amd_rocclr_scheduler", "__amd_rocclr_gwsInit",
+ "__amd_rocclr_initHeap", "__amd_rocclr_fillImage",
+ "__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA",
+ "__amd_rocclr_copyImageToBuffer", "__amd_rocclr_copyBufferToImage",
+ "__amd_rocclr_batchMemOp"};
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
size_t size, const void* value, size_t offset,
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index ba78ef27e8..482df68df9 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -2719,8 +2719,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
else {
// mask is applied on value before performing
// the comparision defined by 'condition'
- bool result = static_cast(blitMgr()).streamOpsWait(*memory, value, offset,
- sizeBytes, flags, mask);
+ bool result = blitMgr().streamOpsWait(*memory, value, offset, sizeBytes, flags, mask);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx."
" Flags: 0x%lx mask: 0x%lx", value, flags, mask);
if (!result) {
@@ -2733,8 +2732,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
// Ensure memory ordering preceding the write
dispatchBarrierPacket(kBarrierPacketReleaseHeader);
- bool result = static_cast(blitMgr()).streamOpsWrite(*memory, value,
- offset, sizeBytes);
+ bool result = blitMgr().streamOpsWrite(*memory, value, offset, sizeBytes);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value);
if (!result) {
LogError("submitStreamOperation: Write failed!");
@@ -2745,6 +2743,18 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
profilingEnd(cmd);
}
+// ================================================================================================
+void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd) {
+ // Make sure VirtualGPU has an exclusive access to the resources
+ amd::ScopedLock lock(execution());
+ profilingBegin(cmd);
+
+ bool result = blitMgr().batchMemOps(cmd.getParamPtr(), cmd.paramSize(), cmd.count());
+ if (!result) {
+ LogError("submitBatchMemoryOperation failed!");
+ }
+ profilingEnd(cmd);
+}
// ================================================================================================
void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index fad6561fb3..38a9181fa1 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -368,6 +368,7 @@ class VirtualGPU : public device::VirtualDevice {
void flush(amd::Command* list = nullptr, bool wait = false);
void submitFillMemory(amd::FillMemoryCommand& cmd);
void submitStreamOperation(amd::StreamOperationCommand& cmd);
+ void submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd);
void submitVirtualMap(amd::VirtualMapCommand& cmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index e825bfff98..395e6bc82f 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -845,7 +845,7 @@ class FillMemoryCommand : public OneMemoryArgCommand {
class StreamOperationCommand : public OneMemoryArgCommand {
private:
- uint64_t value_; // !< Value to Wait on or to Write.
+ uint64_t value_; // !< Value to Wait on or to Write.
uint64_t mask_; // !< Mask to be applied on signal value for Wait operation.
unsigned int flags_; // !< Flags defining the Wait condition.
size_t offset_; // !< Offset into memory for Write
@@ -888,6 +888,46 @@ class StreamOperationCommand : public OneMemoryArgCommand {
const size_t sizeBytes() const { return sizeBytes_; }
};
+/*! \brief A batch memory operation command.
+ *
+ * \details Batch operations to synchronize the stream via memory operations
+ * Operations are either 32-bit stream wait or write.
+ * Wait: All the commands issued after stream wait are not executed
+ * until the wait condition is true.
+ * Write: Writes a 32 or 64 bit vaue to the memory using a GPU Blit.
+ * The operations are enqueued in the order they appear in the array.
+ */
+
+class BatchMemoryOperationCommand : public Command {
+ public:
+ BatchMemoryOperationCommand(HostQueue& queue, cl_command_type cmdType, uint32_t count,
+ uint32_t flags, EventWaitList& eventWaitList, const void* paramArray,
+ size_t paramSize)
+ : Command(queue, cmdType, eventWaitList),
+ count_(count),
+ paramArray_(paramArray),
+ flags_(flags),
+ paramSize_(paramSize) {
+ // Sanity check
+ assert(((cmdType == ROCCLR_COMMAND_BATCH_STREAM)) && "Invalid batch memory operation");
+ }
+
+ virtual void submit(device::VirtualDevice& device) { device.submitBatchMemoryOperation(*this); }
+
+ //! Returns the value
+ const uint64_t count() const { return count_; }
+ //! Return the pointer to the paramList
+ const void* getParamPtr() { return paramArray_; }
+ //! Return the size of a single mem op param in bytes
+ const size_t paramSize() const { return paramSize_; }
+
+ private:
+ uint32_t count_; // !< The number of operations in the array.
+ uint32_t flags_; // !< Reserved for future expansion. Must be 0.
+ const void* paramArray_; // !< Pointer to the array of individual operations
+ size_t paramSize_; // !< size in bytes of the param array passed
+};
+
/*! \brief A generic copy memory command
*
* \details Used for both buffers and images. Backends are expected
@@ -1858,6 +1898,7 @@ union ComputeCommand {
CopyMemoryP2PCommand cmd25;
SvmPrefetchAsyncCommand cmd26;
VirtualMapCommand cmd27;
+ BatchMemoryOperationCommand cmd28;
ComputeCommand() {}
~ComputeCommand() {}
};
diff --git a/rocclr/platform/command_utils.hpp b/rocclr/platform/command_utils.hpp
index 254485bc95..2a706050cb 100644
--- a/rocclr/platform/command_utils.hpp
+++ b/rocclr/platform/command_utils.hpp
@@ -24,6 +24,7 @@
// Dummy command types for Stream Wait and Write commands.
#define ROCCLR_COMMAND_STREAM_WAIT_VALUE 0x4501
#define ROCCLR_COMMAND_STREAM_WRITE_VALUE 0x4502
+#define ROCCLR_COMMAND_BATCH_STREAM 0x4503
// Stream Wait Value Conidtions
#define ROCCLR_STREAM_WAIT_VALUE_GTE 0x0