SWDEV-347670 - GPU StreamWait and StreamWrite support in Windows PAL backend
Change-Id: Ic4881305b6332e217f3d3127dce7e9d9d0a7df11
[ROCm/clr commit: 373a7d1195]
This commit is contained in:
@@ -193,6 +193,24 @@ class BlitManager : public amd::HeapObject {
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const = 0;
|
||||
|
||||
|
||||
//! Stream memory write operation - Write a 'value' at 'memory'.
|
||||
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes
|
||||
) const = 0;
|
||||
|
||||
|
||||
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
|
||||
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes,
|
||||
uint64_t flags,
|
||||
uint64_t mask
|
||||
) const = 0;
|
||||
|
||||
//! Enables synchronization on blit operations
|
||||
void enableSynchronization() { syncOperation_ = true; }
|
||||
|
||||
|
||||
@@ -983,7 +983,9 @@ void CalcRowSlicePitches(uint64_t* pitch, const int32_t* copySize, size_t rowPit
|
||||
}
|
||||
}
|
||||
|
||||
static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) {
|
||||
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index, size_t size,
|
||||
const void* value, size_t offset,
|
||||
const device::Memory* dev_mem, bool writeVAImmediate) const {
|
||||
const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
|
||||
|
||||
void* param = kernel->parameters().values() + desc.offset_;
|
||||
@@ -1003,7 +1005,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
|
||||
} else {
|
||||
// convert cl_mem to amd::Memory*, return false if invalid.
|
||||
LP64_SWITCH(uint32_value, uint64_value) =
|
||||
static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress());
|
||||
static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress()) + offset;
|
||||
reinterpret_cast<Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
|
||||
*static_cast<Memory* const*>(value);
|
||||
@@ -1036,7 +1038,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
|
||||
break;
|
||||
}
|
||||
|
||||
switch (argSize) {
|
||||
switch (desc.size_) {
|
||||
case sizeof(uint32_t):
|
||||
*static_cast<uint32_t*>(param) = uint32_value;
|
||||
break;
|
||||
@@ -2426,6 +2428,85 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
|
||||
size_t offset, size_t sizeBytes) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
uint blitType = StreamOpsWrite;
|
||||
size_t dim = 1;
|
||||
size_t globalWorkOffset[1] = { 0 };
|
||||
size_t globalWorkSize[1] = { 1 };
|
||||
size_t localWorkSize[1] = { 1 };
|
||||
// Program kernels arguments for the write operation
|
||||
Memory* mem = &gpuMem(memory);
|
||||
bool is32BitWrite = (sizeBytes == sizeof(uint32_t)) ? true : false;
|
||||
// Program kernels arguments for the write operation
|
||||
if (is32BitWrite) {
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
|
||||
} else {
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
|
||||
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
|
||||
}
|
||||
setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
// Execute the blit
|
||||
address parameters = kernels_[blitType]->parameters().values();
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
||||
synchronize();
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, size_t offset,
|
||||
size_t sizeBytes, uint64_t flags, uint64_t mask) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
uint blitType = StreamOpsWait;
|
||||
size_t dim = 1;
|
||||
|
||||
size_t globalWorkOffset[1] = { 0 };
|
||||
size_t globalWorkSize[1] = { 1 };
|
||||
size_t localWorkSize[1] = { 1 };
|
||||
|
||||
// Program kernels arguments for the wait operation
|
||||
Memory* mem = &gpuMem(memory);
|
||||
bool is32BitWait = (sizeBytes == sizeof(uint32_t)) ? true : false;
|
||||
// Program kernels arguments for the wait operation
|
||||
if (is32BitWait) {
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
|
||||
setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags);
|
||||
setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask);
|
||||
} else {
|
||||
setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
|
||||
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
|
||||
setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
|
||||
setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
|
||||
setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);
|
||||
}
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = kernels_[blitType]->parameters().values();
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
||||
synchronize();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
bool KernelBlitManager::runScheduler(device::Memory& vqueue, device::Memory& params, uint paramIdx,
|
||||
uint threads) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
|
||||
@@ -155,6 +155,30 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Stream memory write operation - Write a 'value' at 'memory'.
|
||||
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes
|
||||
) const {
|
||||
assert(!"Unimplemented");
|
||||
return false;
|
||||
};
|
||||
|
||||
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
|
||||
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes,
|
||||
uint64_t flags,
|
||||
uint64_t mask
|
||||
) const {
|
||||
assert(!"Unimplemented");
|
||||
return false;
|
||||
};
|
||||
|
||||
|
||||
|
||||
protected:
|
||||
static constexpr uint MaxPinnedBuffers = 4;
|
||||
|
||||
@@ -223,7 +247,9 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
FillImage,
|
||||
Scheduler,
|
||||
GwsInit,
|
||||
BlitTotal
|
||||
StreamOpsWrite,
|
||||
StreamOpsWait,
|
||||
BlitTotal,
|
||||
};
|
||||
|
||||
//! Constructor
|
||||
@@ -378,6 +404,23 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
|
||||
virtual amd::Monitor* lockXfer() const { return &lockXferOps_; }
|
||||
|
||||
//! Stream memory write operation - Write a 'value' at 'memory'.
|
||||
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes
|
||||
) const;
|
||||
|
||||
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
|
||||
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes,
|
||||
uint64_t flags,
|
||||
uint64_t mask
|
||||
) const;
|
||||
|
||||
|
||||
private:
|
||||
static constexpr size_t MaxXferBuffers = 2;
|
||||
static constexpr uint TransferSplitSize = 3;
|
||||
@@ -408,6 +451,12 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
bool createProgram(Device& device //!< Device object
|
||||
);
|
||||
|
||||
inline void setArgument(amd::Kernel* kernel, size_t index,
|
||||
size_t size, const void* value, size_t offset = 0,
|
||||
const device::Memory* dev_mem = nullptr,
|
||||
bool writeVAImmediate = false) const;
|
||||
|
||||
|
||||
//! Creates a view memory object
|
||||
Memory* createView(const Memory& parent, //!< Parent memory object
|
||||
const cl_image_format format //!< The new format for a view
|
||||
@@ -432,7 +481,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
|
||||
"__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer",
|
||||
"__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned",
|
||||
"__amd_rocclr_fillImage", "__amd_rocclr_scheduler",
|
||||
"__amd_rocclr_gwsInit"
|
||||
"__amd_rocclr_gwsInit", "__amd_rocclr_streamOpsWrite",
|
||||
"__amd_rocclr_streamOpsWait"
|
||||
};
|
||||
|
||||
/*@}*/ // namespace pal
|
||||
|
||||
@@ -22,6 +22,25 @@ namespace pal {
|
||||
|
||||
#define BLIT_KERNEL(...) #__VA_ARGS__
|
||||
|
||||
const char* palBlitLinearSourceCode = BLIT_KERNEL(
|
||||
\n
|
||||
extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong);
|
||||
\n
|
||||
extern void __amd_streamOpsWait(__global uint*,__global ulong*, ulong, ulong, ulong);
|
||||
\n
|
||||
__kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong,
|
||||
ulong value, ulong sizeBytes) {
|
||||
__amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
|
||||
}
|
||||
\n
|
||||
__kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong,
|
||||
ulong value, ulong flags, ulong mask) {
|
||||
__amd_streamOpsWait(ptrInt, ptrUlong, value, flags, mask);
|
||||
}
|
||||
\n);
|
||||
|
||||
|
||||
|
||||
const char* SchedulerSourceCode = BLIT_KERNEL(
|
||||
\n
|
||||
extern void __amd_scheduler(__global void*, __global void*, uint);
|
||||
|
||||
@@ -821,6 +821,8 @@ Device::~Device() {
|
||||
extern const char* SchedulerSourceCode;
|
||||
extern const char* SchedulerSourceCode20;
|
||||
extern const char* GwsInitSourceCode;
|
||||
extern const char* palBlitLinearSourceCode;
|
||||
|
||||
Pal::IDevice* gDeviceList[Pal::MaxDevices] = {};
|
||||
uint32_t gStartDevice = 0;
|
||||
uint32_t gNumDevices = 0;
|
||||
@@ -2469,8 +2471,9 @@ bool Device::createBlitProgram() {
|
||||
std::string extraBlits;
|
||||
std::string ocl20;
|
||||
if (amd::IS_HIP) {
|
||||
extraBlits = palBlitLinearSourceCode;
|
||||
if (info().cooperativeGroups_) {
|
||||
extraBlits = GwsInitSourceCode;
|
||||
extraBlits.append(GwsInitSourceCode);
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -2134,6 +2134,48 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
|
||||
void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
profilingBegin(cmd);
|
||||
|
||||
const cl_command_type type = cmd.type();
|
||||
const uint64_t value = cmd.value();
|
||||
const uint64_t mask = cmd.mask();
|
||||
const unsigned int flags = cmd.flags();
|
||||
const size_t sizeBytes = cmd.sizeBytes();
|
||||
const size_t offset = cmd.offset();
|
||||
|
||||
amd::Memory* amdMemory = &cmd.memory();
|
||||
Memory* memory = dev().getGpuMemory(amdMemory);
|
||||
|
||||
if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
|
||||
|
||||
// Use a blit kernel to perform the wait operation
|
||||
// mask is applied on value before performing
|
||||
// the comparision defined by 'condition'
|
||||
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWait(*memory, value, offset,
|
||||
sizeBytes, flags, mask);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx."
|
||||
" Flags: 0x%lx mask: 0x%lx", value, flags, mask);
|
||||
if (!result) {
|
||||
LogError("submitStreamOperation: Wait failed!");
|
||||
}
|
||||
} else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
|
||||
bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWrite(*memory, value,
|
||||
offset, sizeBytes);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value);
|
||||
if (!result) {
|
||||
LogError("submitStreamOperation: Write failed!");
|
||||
}
|
||||
} else {
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
profilingEnd(cmd);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
@@ -349,7 +349,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
|
||||
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
|
||||
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd);
|
||||
|
||||
virtual void submitStreamOperation(amd::StreamOperationCommand& cmd);
|
||||
void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd);
|
||||
|
||||
void releaseMemory(GpuMemoryReference* mem);
|
||||
|
||||
@@ -157,6 +157,30 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Stream memory write operation - Write a 'value' at 'memory'.
|
||||
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes
|
||||
) const {
|
||||
assert(!"Unimplemented");
|
||||
return false;
|
||||
}
|
||||
|
||||
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
|
||||
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes,
|
||||
uint64_t flags,
|
||||
uint64_t mask
|
||||
) const {
|
||||
assert(!"Unimplemented");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected:
|
||||
static constexpr uint MaxPinnedBuffers = 4;
|
||||
static constexpr size_t kMaxH2dMemcpySize = 8 * Ki;
|
||||
@@ -427,14 +451,14 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
) const;
|
||||
|
||||
//! Stream memory write operation - Write a 'value' at 'memory'.
|
||||
bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
|
||||
virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes
|
||||
) const;
|
||||
|
||||
//! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
|
||||
bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
|
||||
virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
|
||||
uint64_t value,
|
||||
size_t offset,
|
||||
size_t sizeBytes,
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
THE SOFTWARE. */
|
||||
|
||||
#include "platform/activity.hpp"
|
||||
#include "platform/command_utils.hpp"
|
||||
|
||||
ACTIVITY_PROF_INSTANCES();
|
||||
|
||||
@@ -59,6 +60,8 @@ const char* getOclCommandKindString(uint32_t op) {
|
||||
CASE_STRING(CL_COMMAND_SVM_MEMFILL, SvmMemFill)
|
||||
CASE_STRING(CL_COMMAND_SVM_MAP, SvmMap)
|
||||
CASE_STRING(CL_COMMAND_SVM_UNMAP, SvmUnmap)
|
||||
CASE_STRING(ROCCLR_COMMAND_STREAM_WAIT_VALUE, StreamWait)
|
||||
CASE_STRING(ROCCLR_COMMAND_STREAM_WRITE_VALUE, StreamWrite)
|
||||
default: case_string = "Unknown command type";
|
||||
};
|
||||
return case_string;
|
||||
|
||||
مرجع در شماره جدید
Block a user