From d4caaa51a2eb3d659ff61b566dc668b9a7960f72 Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 19 Aug 2019 17:39:27 -0400
Subject: [PATCH] P4 to Git Change 1985445 by
skudchad@skudchad_test2_win_opencl on 2019/08/19 17:35:03
SWDEV-198861 - Implement VDI equivalent to HCC_SERIALIZE_KERNEL/HCC_SERIALIZE_COPY
- Use env var AMD_SERIALIZE_COPY/AMD_SERIALIZE_KERNEL
0x1 - Wait for completion before the command is submitted
0x2 - Wait for completion after the command is submitted
0x3 - Wait both before and after submission
ReviewBoardURL = http://ocltc.amd.com/reviews/r/17843/diff/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#95 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#93 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#316 edit
---
rocclr/runtime/platform/command.cpp | 11 ++++---
rocclr/runtime/platform/command.hpp | 42 +++++++++++++++---------
rocclr/runtime/platform/commandqueue.cpp | 11 +++++--
rocclr/runtime/utils/flags.hpp | 8 ++++-
4 files changed, 48 insertions(+), 24 deletions(-)
diff --git a/rocclr/runtime/platform/command.cpp b/rocclr/runtime/platform/command.cpp
index 83bbb16fd1..b540d6b8fc 100644
--- a/rocclr/runtime/platform/command.cpp
+++ b/rocclr/runtime/platform/command.cpp
@@ -187,14 +187,16 @@ bool Event::notifyCmdQueue() {
const Event::EventWaitList Event::nullWaitList(0);
-Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList)
+Command::Command(HostQueue& queue, cl_command_type type,
+ const EventWaitList& eventWaitList, uint32_t commandWaitBits)
: Event(queue),
queue_(&queue),
next_(NULL),
type_(type),
exception_(0),
data_(NULL),
- eventWaitList_(eventWaitList) {
+ eventWaitList_(eventWaitList),
+ commandWaitBits_(commandWaitBits) {
// Retain the commands from the event wait list.
std::for_each(eventWaitList.begin(), eventWaitList.end(), std::mem_fun(&Command::retain));
}
@@ -218,7 +220,8 @@ void Command::enqueue() {
}
queue_->append(*this);
queue_->flush();
- if (queue_->device().settings().waitCommand_ && (type_ != 0)) {
+ if ((queue_->device().settings().waitCommand_ && (type_ != 0)) ||
+ ((commandWaitBits_ & 0x2) != 0)) {
awaitCompletion();
}
}
@@ -228,7 +231,7 @@ const Context& Command::context() const { return queue_->context(); }
NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList,
Kernel& kernel, const NDRangeContainer& sizes,
uint32_t sharedMemBytes, uint32_t extraParam)
- : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList)
+ : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList, AMD_SERIALIZE_KERNEL)
, kernel_(kernel)
, sizes_(sizes)
, sharedMemBytes_(sharedMemBytes)
diff --git a/rocclr/runtime/platform/command.hpp b/rocclr/runtime/platform/command.hpp
index cf8b2ed507..2a52954d5f 100644
--- a/rocclr/runtime/platform/command.hpp
+++ b/rocclr/runtime/platform/command.hpp
@@ -91,8 +91,8 @@ class Event : public RuntimeObject {
uint64_t submitted_;
uint64_t start_;
uint64_t end_;
- bool enabled_; //!< Profiling enabled for the wave limiter
- uint32_t waves_; //!< The number of waves used in a dispatch
+ bool enabled_; //!< Profiling enabled for the wave limiter
+ uint32_t waves_; //!< The number of waves used in a dispatch
ProfilingCallback* callback_;
void clear() {
queued_ = 0ULL;
@@ -202,9 +202,13 @@ class Command : public Event {
//! The Events that need to complete before this command is submitted.
EventWaitList eventWaitList_;
+ //! Bitmask forcing a wait for completion around this command's enqueue:
+ //! 0x1 - wait before enqueue, 0x2 - wait after enqueue, 0x3 - wait both.
+ uint32_t commandWaitBits_;
+
//! Construct a new command of the given OpenCL type.
- Command(HostQueue& queue, cl_command_type type,
- const EventWaitList& eventWaitList = nullWaitList);
+ Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList = nullWaitList,
+ uint32_t commandWaitBits = 0);
//! Construct a new command of the given OpenCL type.
Command(cl_command_type type)
@@ -214,7 +218,8 @@ class Command : public Event {
type_(type),
exception_(0),
data_(NULL),
- eventWaitList_(nullWaitList) {}
+ eventWaitList_(nullWaitList),
+ commandWaitBits_(0) {}
bool terminate() {
if (Agent::shouldPostEventEvents() && type() != 0) {
@@ -271,6 +276,9 @@ class Command : public Event {
//! Return the context for this event.
virtual const Context& context() const;
+
+ //! Get command wait bits
+ uint32_t getWaitBits() const { return commandWaitBits_; }
};
class UserEvent : public Command {
@@ -319,7 +327,7 @@ class OneMemoryArgCommand : public Command {
public:
OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
Memory& memory)
- : Command(queue, type, eventWaitList), memory_(&memory) {
+ : Command(queue, type, eventWaitList, AMD_SERIALIZE_COPY), memory_(&memory) {
memory_->retain();
}
@@ -342,7 +350,9 @@ class TwoMemoryArgsCommand : public Command {
public:
TwoMemoryArgsCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
Memory& memory1, Memory& memory2)
- : Command(queue, type, eventWaitList), memory1_(&memory1), memory2_(&memory2) {
+ : Command(queue, type, eventWaitList, AMD_SERIALIZE_COPY),
+ memory1_(&memory1),
+ memory2_(&memory2) {
memory1_->retain();
memory2_->retain();
}
@@ -788,7 +798,8 @@ class NDRangeKernelCommand : public Command {
//! Return the cooperative multi device groups mode
bool cooperativeMultiDeviceGroups() const {
- return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false; }
+ return (extraParam_ & CooperativeMultiDeviceGroups) ? true : false;
+ }
//! Set the local work size.
void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; }
@@ -995,7 +1006,7 @@ class ThreadTraceMemObjectsCommand : public Command {
private:
std::vector memObjects_; //!< The list of memory objects,bound to the thread trace
size_t sizeMemObjects_; //!< The size of each memory object from memObjects_ list (all memory
- //!objects have the smae size)
+ //! objects have the same size)
ThreadTrace& threadTrace_; //!< The Thread Trace object
};
@@ -1196,7 +1207,7 @@ class SvmMapMemoryCommand : public Command {
Memory* svmMem_; //!< the pointer to the amd::Memory object corresponding the svm pointer mapped
Coord3D size_; //!< the map size
Coord3D origin_; //!< the origin of the mapped svm pointer shift from the beginning of svm space
- //!allocated
+ //! allocated
cl_map_flags flags_; //!< map flags
void* svmPtr_;
@@ -1315,12 +1326,11 @@ class TransferBufferFileCommand : public OneMemoryArgCommand {
class CopyMemoryP2PCommand : public CopyMemoryCommand {
public:
- CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList,
- Memory& srcMemory, Memory& dstMemory, Coord3D srcOrigin, Coord3D dstOrigin,
- Coord3D size)
- : CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin, size)
- {
- }
+ CopyMemoryP2PCommand(HostQueue& queue, cl_command_type cmdType,
+ const EventWaitList& eventWaitList, Memory& srcMemory, Memory& dstMemory,
+ Coord3D srcOrigin, Coord3D dstOrigin, Coord3D size)
+ : CopyMemoryCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory, srcOrigin, dstOrigin,
+ size) {}
virtual void submit(device::VirtualDevice& device) { device.submitCopyMemoryP2P(*this); }
diff --git a/rocclr/runtime/platform/commandqueue.cpp b/rocclr/runtime/platform/commandqueue.cpp
index 95ed258551..e559d23ffb 100644
--- a/rocclr/runtime/platform/commandqueue.cpp
+++ b/rocclr/runtime/platform/commandqueue.cpp
@@ -35,7 +35,8 @@ bool HostQueue::terminate() {
Command* marker = nullptr;
// Send a finish if the queue is still accepting commands.
- { ScopedLock sl(queueLock_);
+ {
+ ScopedLock sl(queueLock_);
if (thread_.acceptingCommands_) {
marker = new Marker(*this, false);
if (marker != nullptr) {
@@ -50,7 +51,8 @@ bool HostQueue::terminate() {
}
// Wake-up the command loop, so it can exit
- { ScopedLock sl(queueLock_);
+ {
+ ScopedLock sl(queueLock_);
thread_.acceptingCommands_ = false;
queueLock_.notify();
}
@@ -159,6 +161,9 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
void HostQueue::append(Command& command) {
// We retain the command here. It will be released when its status
// changes to CL_COMPLETE
+ if ((command.getWaitBits() & 0x1) != 0) {
+ finish();
+ }
command.retain();
command.setStatus(CL_QUEUED);
queue_.enqueue(&command);
@@ -214,4 +219,4 @@ bool DeviceQueue::create() {
return result;
}
-} // namespace amd {
+} // namespace amd
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index 81a7bcb3af..e77ac4d8f2 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -191,7 +191,13 @@ release(bool, GPU_DUMP_CODE_OBJECT, false, \
"Enable dump code object") \
release(uint, GPU_MAX_USWC_ALLOC_SIZE, 2048, \
"Set a limit in Mb on the maximum USWC allocation size" \
- "-1 = No limit")
+ "-1 = No limit") \
+release(uint, AMD_SERIALIZE_KERNEL, 0, \
+ "Serialize kernel enqueue, 0x1 = Wait for completion before enqueue" \
+ "0x2 = Wait for completion after enqueue 0x3 = both") \
+release(uint, AMD_SERIALIZE_COPY, 0, \
+ "Serialize copies, 0x1 = Wait for completion before enqueue" \
+ "0x2 = Wait for completion after enqueue 0x3 = both")
namespace amd {