SWDEV-440866 - [hip-roclr] Adds support to batch memory operations APIs
Change-Id: I449ffca44bbb04d13348d112e896d603c70fd485
This commit is contained in:
committed by
Sourabh Betigeri
vanhempi
c47f9dda58
commit
bd5d8e9baf
@@ -61,7 +61,7 @@
|
||||
// - Reset any of the *_STEP_VERSION defines to zero if the corresponding *_MAJOR_VERSION increases
|
||||
#define HIP_API_TABLE_STEP_VERSION 0
|
||||
#define HIP_COMPILER_API_TABLE_STEP_VERSION 0
|
||||
#define HIP_RUNTIME_API_TABLE_STEP_VERSION 6
|
||||
#define HIP_RUNTIME_API_TABLE_STEP_VERSION 7
|
||||
|
||||
// HIP API interface
|
||||
typedef hipError_t (*t___hipPopCallConfiguration)(dim3* gridDim, dim3* blockDim, size_t* sharedMem,
|
||||
@@ -722,6 +722,8 @@ typedef hipError_t (*t_hipStreamWriteValue32)(hipStream_t stream, void* ptr, uin
|
||||
unsigned int flags);
|
||||
typedef hipError_t (*t_hipStreamWriteValue64)(hipStream_t stream, void* ptr, uint64_t value,
|
||||
unsigned int flags);
|
||||
typedef hipError_t (*t_hipStreamBatchMemOp)(hipStream_t stream, unsigned int count,
|
||||
hipStreamBatchMemOpParams* paramArray, unsigned int flags);
|
||||
typedef hipError_t (*t_hipTexObjectCreate)(hipTextureObject_t* pTexObject,
|
||||
const HIP_RESOURCE_DESC* pResDesc,
|
||||
const HIP_TEXTURE_DESC* pTexDesc,
|
||||
@@ -1519,6 +1521,9 @@ struct HipDispatchTable {
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 6
|
||||
t_hipDeviceGetTexture1DLinearMaxWidth hipDeviceGetTexture1DLinearMaxWidth_fn;
|
||||
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 7
|
||||
t_hipStreamBatchMemOp hipStreamBatchMemOp_fn;
|
||||
|
||||
// DO NOT EDIT ABOVE!
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 7
|
||||
|
||||
|
||||
@@ -425,7 +425,8 @@ enum hip_api_id_t {
|
||||
HIP_API_ID_hipMemcpyHtoAAsync = 405,
|
||||
HIP_API_ID_hipSetValidDevices = 406,
|
||||
HIP_API_ID_hipExtHostAlloc = 407,
|
||||
HIP_API_ID_LAST = 407,
|
||||
HIP_API_ID_hipStreamBatchMemOp = 408,
|
||||
HIP_API_ID_LAST = 408,
|
||||
|
||||
HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice),
|
||||
HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties),
|
||||
@@ -545,7 +546,6 @@ static inline const char* hip_api_name(const uint32_t id) {
|
||||
case HIP_API_ID_hipEventQuery: return "hipEventQuery";
|
||||
case HIP_API_ID_hipEventRecord: return "hipEventRecord";
|
||||
case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize";
|
||||
case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError";
|
||||
case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount";
|
||||
case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel";
|
||||
case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice";
|
||||
@@ -859,6 +859,8 @@ static inline const char* hip_api_name(const uint32_t id) {
|
||||
case HIP_API_ID_hipUserObjectRelease: return "hipUserObjectRelease";
|
||||
case HIP_API_ID_hipUserObjectRetain: return "hipUserObjectRetain";
|
||||
case HIP_API_ID_hipWaitExternalSemaphoresAsync: return "hipWaitExternalSemaphoresAsync";
|
||||
case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError";
|
||||
case HIP_API_ID_hipStreamBatchMemOp: return "hipStreamBatchMemOp";
|
||||
};
|
||||
return "unknown";
|
||||
};
|
||||
@@ -1262,6 +1264,7 @@ static inline uint32_t hipApiIdByName(const char* name) {
|
||||
if (strcmp("hipUserObjectRelease", name) == 0) return HIP_API_ID_hipUserObjectRelease;
|
||||
if (strcmp("hipUserObjectRetain", name) == 0) return HIP_API_ID_hipUserObjectRetain;
|
||||
if (strcmp("hipWaitExternalSemaphoresAsync", name) == 0) return HIP_API_ID_hipWaitExternalSemaphoresAsync;
|
||||
if (strcmp("hipStreamBatchMemOp", name) == 0) return HIP_API_ID_hipStreamBatchMemOp;
|
||||
return HIP_API_ID_NONE;
|
||||
}
|
||||
|
||||
@@ -3623,6 +3626,13 @@ typedef struct hip_api_data_s {
|
||||
unsigned int numExtSems;
|
||||
hipStream_t stream;
|
||||
} hipWaitExternalSemaphoresAsync;
|
||||
struct {
|
||||
hipStream_t stream;
|
||||
unsigned int count;
|
||||
hipStreamBatchMemOpParams* paramArray;
|
||||
hipStreamBatchMemOpParams paramArray__val;
|
||||
unsigned int flags;
|
||||
} hipStreamBatchMemOp;
|
||||
} args;
|
||||
uint64_t *phase_data;
|
||||
} hip_api_data_t;
|
||||
@@ -5892,6 +5902,15 @@ typedef struct hip_api_data_s {
|
||||
cb_data.args.hipStreamWriteValue64.value = (uint64_t)value; \
|
||||
cb_data.args.hipStreamWriteValue64.flags = (unsigned int)flags; \
|
||||
};
|
||||
|
||||
// hipStreamBatchMemOp[('hipStream_t', 'stream'), ('unsigned int', 'count'),
|
||||
// ('hipStreamBatchMemOpParams*', 'paramArray'), ('unsigned int', 'flags')]
|
||||
#define INIT_hipStreamBatchMemOp_CB_ARGS_DATA(cb_data) { \
|
||||
cb_data.args.hipStreamBatchMemOp.stream = (hipStream_t)stream; \
|
||||
cb_data.args.hipStreamBatchMemOp.count = (unsigned int)count; \
|
||||
cb_data.args.hipStreamBatchMemOp.paramArray= (hipStreamBatchMemOpParams*)paramArray; \
|
||||
cb_data.args.hipStreamBatchMemOp.flags = (unsigned int)flags; \
|
||||
};
|
||||
// hipTexRefGetAddress[('hipDeviceptr_t*', 'dev_ptr'), ('const textureReference*', 'texRef')]
|
||||
#define INIT_hipTexRefGetAddress_CB_ARGS_DATA(cb_data) { \
|
||||
cb_data.args.hipTexRefGetAddress.dev_ptr = (hipDeviceptr_t*)dptr; \
|
||||
@@ -7527,6 +7546,11 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
|
||||
// hipStreamWriteValue64[('hipStream_t', 'stream'), ('void*', 'ptr'), ('uint64_t', 'value'), ('unsigned int', 'flags')]
|
||||
case HIP_API_ID_hipStreamWriteValue64:
|
||||
break;
|
||||
// hipStreamBatchMemOp[('hipStream_t', 'stream'), ('unsigned int', 'count'),
|
||||
// ('hipStreamBatchMemOpParams*', 'paramArray'), ('unsigned int', 'flags')]
|
||||
case HIP_API_ID_hipStreamBatchMemOp:
|
||||
if (data->args.hipStreamBatchMemOp.paramArray) data->args.hipStreamBatchMemOp.paramArray__val = *(data->args.hipStreamBatchMemOp.paramArray);
|
||||
break;
|
||||
// hipTexRefGetAddress[('hipDeviceptr_t*', 'dev_ptr'), ('const textureReference*', 'texRef')]
|
||||
case HIP_API_ID_hipTexRefGetAddress:
|
||||
if (data->args.hipTexRefGetAddress.dev_ptr) data->args.hipTexRefGetAddress.dev_ptr__val = *(data->args.hipTexRefGetAddress.dev_ptr);
|
||||
@@ -10614,6 +10638,15 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
|
||||
oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamWriteValue64.flags);
|
||||
oss << ")";
|
||||
break;
|
||||
case HIP_API_ID_hipStreamBatchMemOp:
|
||||
oss << "hipStreamBatchMemOp(";
|
||||
oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.stream);
|
||||
oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.count);
|
||||
if (data->args.hipStreamBatchMemOp.paramArray == NULL) oss << ", paramArray=NULL";
|
||||
else { oss << ", paramArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.paramArray__val); }
|
||||
oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBatchMemOp.flags);
|
||||
oss << ")";
|
||||
break;
|
||||
case HIP_API_ID_hipTexRefGetAddress:
|
||||
oss << "hipTexRefGetAddress(";
|
||||
if (data->args.hipTexRefGetAddress.dev_ptr == NULL) oss << "dev_ptr=NULL";
|
||||
|
||||
@@ -299,7 +299,7 @@ if(WIN32 OR NOT BUILD_SHARED_LIBS)
|
||||
# rocprofiler-register is not support on Windows
|
||||
set(HIP_ENABLE_ROCPROFILER_REGISTER OFF)
|
||||
else()
|
||||
option(HIP_ENABLE_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
|
||||
option(HIP_ENABLE_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
|
||||
endif()
|
||||
|
||||
if(HIP_ENABLE_ROCPROFILER_REGISTER)
|
||||
|
||||
@@ -480,3 +480,4 @@ hipGraphExecNodeSetParams
|
||||
hipDrvGraphMemcpyNodeSetParams
|
||||
hipDrvGraphMemcpyNodeGetParams
|
||||
hipExtHostAlloc
|
||||
hipStreamBatchMemOp
|
||||
|
||||
@@ -602,6 +602,8 @@ hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, uint64_t value, u
|
||||
uint64_t mask);
|
||||
hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value, unsigned int flags);
|
||||
hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags);
|
||||
hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count, hipStreamBatchMemOpParams* paramArray,
|
||||
unsigned int flags);
|
||||
hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc,
|
||||
const HIP_TEXTURE_DESC* pTexDesc,
|
||||
const HIP_RESOURCE_VIEW_DESC* pResViewDesc);
|
||||
@@ -1197,6 +1199,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) {
|
||||
ptrDispatchTable->hipStreamWaitValue64_fn = hip::hipStreamWaitValue64;
|
||||
ptrDispatchTable->hipStreamWriteValue32_fn = hip::hipStreamWriteValue32;
|
||||
ptrDispatchTable->hipStreamWriteValue64_fn = hip::hipStreamWriteValue64;
|
||||
ptrDispatchTable->hipStreamBatchMemOp_fn = hip::hipStreamBatchMemOp;
|
||||
ptrDispatchTable->hipTexObjectCreate_fn = hip::hipTexObjectCreate;
|
||||
ptrDispatchTable->hipTexObjectDestroy_fn = hip::hipTexObjectDestroy;
|
||||
ptrDispatchTable->hipTexObjectGetResourceDesc_fn = hip::hipTexObjectGetResourceDesc;
|
||||
@@ -1887,6 +1890,9 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipDrvGraphMemcpyNodeSetParams_fn, 460)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipExtHostAlloc_fn, 461)
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 6
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipDeviceGetTexture1DLinearMaxWidth_fn, 462)
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 7
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipStreamBatchMemOp_fn, 463);
|
||||
|
||||
|
||||
// if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below
|
||||
// will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.:
|
||||
@@ -1894,9 +1900,9 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipDeviceGetTexture1DLinearMaxWidth_fn, 462)
|
||||
// HIP_ENFORCE_ABI(<table>, <functor>, 8)
|
||||
//
|
||||
// HIP_ENFORCE_ABI_VERSIONING(<table>, 9) <- 8 + 1 = 9
|
||||
HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 463)
|
||||
HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 464)
|
||||
|
||||
static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 6,
|
||||
static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 7,
|
||||
"If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function "
|
||||
"pointers and then update this check so it is true");
|
||||
#endif
|
||||
|
||||
@@ -583,6 +583,7 @@ local:
|
||||
hip_6.3 {
|
||||
global:
|
||||
hipExtHostAlloc;
|
||||
hipStreamBatchMemOp;
|
||||
local:
|
||||
*;
|
||||
} hip_6.2;
|
||||
|
||||
@@ -23,8 +23,36 @@
|
||||
#include "platform/command_utils.hpp"
|
||||
|
||||
namespace hip {
|
||||
hipError_t ihipBatchMemOperation(hipStream_t stream, cl_command_type cmdType, unsigned int count,
|
||||
hipStreamBatchMemOpParams* paramArray, unsigned int flags) {
|
||||
if (paramArray == nullptr || flags != 0 || count > 256) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
|
||||
hip::Stream* hip_stream = hip::getStream(stream);
|
||||
amd::Command::EventWaitList waitList;
|
||||
|
||||
amd::BatchMemoryOperationCommand* command =
|
||||
new amd::BatchMemoryOperationCommand(*hip_stream, cmdType, count,
|
||||
flags, waitList, paramArray,
|
||||
sizeof(hipStreamBatchMemOpParams));
|
||||
|
||||
if (command == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
command->enqueue();
|
||||
command->release();
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
|
||||
hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr,
|
||||
uint64_t value, uint64_t mask, unsigned int flags, size_t sizeBytes) {
|
||||
uint64_t value, uint64_t mask, unsigned int flags,
|
||||
size_t sizeBytes) {
|
||||
size_t offset = 0;
|
||||
unsigned int outFlags = 0;
|
||||
|
||||
@@ -136,4 +164,15 @@ hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value,
|
||||
0, // flags un-used for now set it to 0
|
||||
sizeof(uint64_t)));
|
||||
}
|
||||
|
||||
hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
|
||||
hipStreamBatchMemOpParams* paramArray, unsigned int flags) {
|
||||
HIP_INIT_API(hipStreamBatchMemOp, count, paramArray, flags);
|
||||
HIP_RETURN_DURATION(ihipBatchMemOperation(
|
||||
stream,
|
||||
ROCCLR_COMMAND_BATCH_STREAM,
|
||||
count,
|
||||
paramArray,
|
||||
flags));
|
||||
}
|
||||
} // namespace hip
|
||||
|
||||
@@ -1413,6 +1413,10 @@ hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value,
|
||||
unsigned int flags) {
|
||||
return hip::GetHipDispatchTable()->hipStreamWriteValue64_fn(stream, ptr, value, flags);
|
||||
}
|
||||
hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
|
||||
hipStreamBatchMemOpParams* paramArray, unsigned int flags) {
|
||||
return hip::GetHipDispatchTable()->hipStreamBatchMemOp_fn(stream, count, paramArray, flags);
|
||||
}
|
||||
hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc,
|
||||
const HIP_TEXTURE_DESC* pTexDesc,
|
||||
const HIP_RESOURCE_VIEW_DESC* pResViewDesc) {
|
||||
|
||||
@@ -233,6 +233,12 @@ class BlitManager : public amd::HeapObject {
|
||||
uint64_t mask
|
||||
) const = 0;
|
||||
|
||||
//! Stream batch memory operation
|
||||
virtual bool batchMemOps(const void* paramArray,
|
||||
size_t paramSize,
|
||||
uint32_t count
|
||||
) const = 0;
|
||||
|
||||
//! Enables synchronization on blit operations
|
||||
void enableSynchronization() { syncOperation_ = true; }
|
||||
|
||||
|
||||
@@ -43,6 +43,8 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
|
||||
|
||||
extern void __amd_streamOpsWait(__global uint*, __global ulong*, ulong, ulong, ulong);
|
||||
|
||||
extern void __amd_batchMemOp(__global void*, uint count);
|
||||
|
||||
extern void __ockl_dm_init_v1(ulong, ulong, uint, uint);
|
||||
|
||||
extern void __ockl_gws_init(uint nwm1, uint rid);
|
||||
@@ -162,6 +164,10 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(
|
||||
ulong4 srcRect, ulong4 dstRect, ulong4 size) {
|
||||
__amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size);
|
||||
}
|
||||
|
||||
__kernel void __amd_rocclr_batchMemOp(__global void* params, uint count) {
|
||||
__amd_batchMemOp(params, count);
|
||||
}
|
||||
);
|
||||
|
||||
const char* HipExtraSourceCode = BLIT_KERNELS(
|
||||
@@ -254,7 +260,6 @@ const char* BlitImageSourceCode = BLIT_KERNELS(
|
||||
__amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar, srcOrigin, dstOrigin, size, format,
|
||||
pitch);
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
} // namespace amd::device
|
||||
|
||||
@@ -92,6 +92,7 @@ class SvmMapMemoryCommand;
|
||||
class SvmUnmapMemoryCommand;
|
||||
class SvmPrefetchAsyncCommand;
|
||||
class StreamOperationCommand;
|
||||
class BatchMemoryOperationCommand;
|
||||
class VirtualMapCommand;
|
||||
class ExternalSemaphoreCmd;
|
||||
class Isa;
|
||||
@@ -1308,6 +1309,9 @@ class VirtualDevice : public amd::HeapObject {
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
virtual void submitStreamOperation(amd::StreamOperationCommand& cmd) { ShouldNotReachHere(); }
|
||||
virtual void submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd) {
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
|
||||
|
||||
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
|
||||
|
||||
@@ -475,6 +475,13 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
uint number_of_initial_blocks
|
||||
) const;
|
||||
|
||||
//! Batch memory ops- Submits batch of streamWaits and streamWrite operations.
|
||||
virtual bool batchMemOps(const void* paramArray,
|
||||
size_t paramSize,
|
||||
uint32_t count) const {
|
||||
assert(!"Unimplemented");
|
||||
return false;
|
||||
}
|
||||
private:
|
||||
static constexpr size_t MaxXferBuffers = 2;
|
||||
static constexpr uint TransferSplitSize = 3;
|
||||
|
||||
@@ -2583,6 +2583,38 @@ bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, si
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::batchMemOps(const void* paramArray, size_t paramSize,
|
||||
uint32_t count) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
uint blitType = BatchMemOp;
|
||||
size_t dim = 1;
|
||||
|
||||
size_t globalWorkOffset[1] = { 0 };
|
||||
size_t globalWorkSize[1] = { count };
|
||||
size_t localWorkSize[1] = { 1 };
|
||||
|
||||
// Get constant buffer and copy the array of parameters
|
||||
constexpr bool kDirectVa = true;
|
||||
auto constBuf = gpu().allocKernArg((count * paramSize), kCBAlignment);
|
||||
memcpy(constBuf, paramArray, (count * paramSize));
|
||||
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &count);
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = captureArguments(kernels_[blitType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
synchronize();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::initHeap(device::Memory* heap_to_initialize, device::Memory* initial_blocks,
|
||||
uint heap_size, uint number_of_initial_blocks) const {
|
||||
|
||||
@@ -292,6 +292,7 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
BlitCopyBufferRectAligned,
|
||||
StreamOpsWrite,
|
||||
StreamOpsWait,
|
||||
BatchMemOp,
|
||||
Scheduler,
|
||||
GwsInit,
|
||||
InitHeap,
|
||||
@@ -519,6 +520,9 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
uint64_t mask
|
||||
) const;
|
||||
|
||||
//! Batch memory ops- Submits batch of streamWaits and streamWrite operations.
|
||||
virtual bool batchMemOps(const void* paramArray, size_t paramSize, uint32_t count) const;
|
||||
|
||||
virtual amd::Monitor* lockXfer() const { return &lockXferOps_; }
|
||||
|
||||
virtual bool initHeap(device::Memory* heap_to_initialize,
|
||||
@@ -599,9 +603,9 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
|
||||
"__amd_rocclr_fillBufferAligned", "__amd_rocclr_fillBufferAligned2D", "__amd_rocclr_copyBuffer",
|
||||
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_copyBufferRect",
|
||||
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_streamOpsWrite", "__amd_rocclr_streamOpsWait",
|
||||
"__amd_rocclr_scheduler", "__amd_rocclr_gwsInit", "__amd_rocclr_initHeap",
|
||||
"__amd_rocclr_fillImage", "__amd_rocclr_copyImage", "__amd_rocclr_copyImage1DA",
|
||||
"__amd_rocclr_copyImageToBuffer", "__amd_rocclr_copyBufferToImage"
|
||||
"__amd_rocclr_batchMemOp", "__amd_rocclr_scheduler", "__amd_rocclr_gwsInit",
|
||||
"__amd_rocclr_initHeap", "__amd_rocclr_fillImage", "__amd_rocclr_copyImage",
|
||||
"__amd_rocclr_copyImage1DA", "__amd_rocclr_copyImageToBuffer", "__amd_rocclr_copyBufferToImage"
|
||||
};
|
||||
|
||||
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
|
||||
|
||||
@@ -2715,8 +2715,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
|
||||
else {
|
||||
// mask is applied on value before performing
|
||||
// the comparision defined by 'condition'
|
||||
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWait(*memory, value, offset,
|
||||
sizeBytes, flags, mask);
|
||||
bool result = blitMgr().streamOpsWait(*memory, value, offset, sizeBytes, flags, mask);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx."
|
||||
" Flags: 0x%lx mask: 0x%lx", value, flags, mask);
|
||||
if (!result) {
|
||||
@@ -2729,8 +2728,7 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
|
||||
// Ensure memory ordering preceding the write
|
||||
dispatchBarrierPacket(kBarrierPacketReleaseHeader);
|
||||
|
||||
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWrite(*memory, value,
|
||||
offset, sizeBytes);
|
||||
bool result = blitMgr().streamOpsWrite(*memory, value, offset, sizeBytes);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value);
|
||||
if (!result) {
|
||||
LogError("submitStreamOperation: Write failed!");
|
||||
@@ -2741,6 +2739,18 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
|
||||
profilingEnd(cmd);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
profilingBegin(cmd);
|
||||
|
||||
bool result = blitMgr().batchMemOps(cmd.getParamPtr(), cmd.paramSize(), cmd.count());
|
||||
if (!result) {
|
||||
LogError("submitBatchMemoryOperation failed!");
|
||||
}
|
||||
profilingEnd(cmd);
|
||||
}
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
|
||||
@@ -353,6 +353,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void flush(amd::Command* list = nullptr, bool wait = false);
|
||||
void submitFillMemory(amd::FillMemoryCommand& cmd);
|
||||
void submitStreamOperation(amd::StreamOperationCommand& cmd);
|
||||
void submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd);
|
||||
void submitVirtualMap(amd::VirtualMapCommand& cmd);
|
||||
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
|
||||
|
||||
|
||||
@@ -845,7 +845,7 @@ class FillMemoryCommand : public OneMemoryArgCommand {
|
||||
|
||||
class StreamOperationCommand : public OneMemoryArgCommand {
|
||||
private:
|
||||
uint64_t value_; // !< Value to Wait on or to Write.
|
||||
uint64_t value_; // !< Value to Wait on or to Write.
|
||||
uint64_t mask_; // !< Mask to be applied on signal value for Wait operation.
|
||||
unsigned int flags_; // !< Flags defining the Wait condition.
|
||||
size_t offset_; // !< Offset into memory for Write
|
||||
@@ -888,6 +888,47 @@ class StreamOperationCommand : public OneMemoryArgCommand {
|
||||
const size_t sizeBytes() const { return sizeBytes_; }
|
||||
};
|
||||
|
||||
/*! \brief A batch memory operation command.
|
||||
*
|
||||
* \details Batch operations to synchronize the stream via memory operations
|
||||
* Operations are either 32-bit stream wait or write.
|
||||
* Wait: All the commands issued after stream wait are not executed
|
||||
* until the wait condition is true.
|
||||
* Write: Writes a 32 or 64 bit vaue to the memory using a GPU Blit.
|
||||
* The operations are enqueued in the order they appear in the array.
|
||||
*/
|
||||
|
||||
class BatchMemoryOperationCommand : public Command {
|
||||
public:
|
||||
BatchMemoryOperationCommand(HostQueue& queue, cl_command_type cmdType, uint32_t count,
|
||||
uint32_t flags, EventWaitList& eventWaitList, const void* paramArray,
|
||||
size_t paramSize)
|
||||
: Command(queue, cmdType, eventWaitList),
|
||||
count_(count),
|
||||
paramArray_(paramArray),
|
||||
flags_(flags),
|
||||
paramSize_(paramSize) {
|
||||
// Sanity check
|
||||
assert(((cmdType == ROCCLR_COMMAND_BATCH_STREAM)) &&
|
||||
"Invalid batch memory operation");
|
||||
}
|
||||
|
||||
virtual void submit(device::VirtualDevice& device) { device.submitBatchMemoryOperation(*this); }
|
||||
|
||||
//! Returns the value
|
||||
const uint64_t count() const { return count_; }
|
||||
//! Return the pointer to the paramList
|
||||
const void* getParamPtr() {return paramArray_; }
|
||||
//! Return the size of a single mem op param in bytes
|
||||
const size_t paramSize() const { return paramSize_; }
|
||||
|
||||
private:
|
||||
uint32_t count_; // !< The number of operations in the array.
|
||||
uint32_t flags_; // !< Reserved for future expansion. Must be 0.
|
||||
const void* paramArray_; // !< Pointer to the array of individual operations
|
||||
size_t paramSize_; // !< size in bytes of the param array passed
|
||||
};
|
||||
|
||||
/*! \brief A generic copy memory command
|
||||
*
|
||||
* \details Used for both buffers and images. Backends are expected
|
||||
@@ -1858,6 +1899,7 @@ union ComputeCommand {
|
||||
CopyMemoryP2PCommand cmd25;
|
||||
SvmPrefetchAsyncCommand cmd26;
|
||||
VirtualMapCommand cmd27;
|
||||
BatchMemoryOperationCommand cmd28;
|
||||
ComputeCommand() {}
|
||||
~ComputeCommand() {}
|
||||
};
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
// Dummy command types for Stream Wait and Write commands.
|
||||
#define ROCCLR_COMMAND_STREAM_WAIT_VALUE 0x4501
|
||||
#define ROCCLR_COMMAND_STREAM_WRITE_VALUE 0x4502
|
||||
#define ROCCLR_COMMAND_BATCH_STREAM 0x4503
|
||||
|
||||
// Stream Wait Value Conidtions
|
||||
#define ROCCLR_STREAM_WAIT_VALUE_GTE 0x0
|
||||
|
||||
Viittaa uudesa ongelmassa
Block a user