diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 4db0dc6bc8..8e60af681b 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1527,6 +1527,13 @@ class Device : public RuntimeObject {
kKernArg = 2
} MemorySegment;
+ typedef enum CacheState {  //!< Device cache state, recorded from an AQL packet's release fence scope
+ kCacheStateInvalid = -1,  //!< Nothing recorded yet (initial value set by the ROCm device ctor)
+ kCacheStateIgnore = 0,  //!< Release scope field 0 - presumably HSA_FENCE_SCOPE_NONE (no flush)
+ kCacheStateAgent = 1,  //!< Release scope field 1 - presumably HSA_FENCE_SCOPE_AGENT
+ kCacheStateSystem = 2  //!< Release scope field 2 - presumably HSA_FENCE_SCOPE_SYSTEM
+ } CacheState;
+
typedef std::pair LinkAttrType;
static constexpr size_t kP2PStagingSize = 4 * Mi;
@@ -1749,6 +1756,11 @@ class Device : public RuntimeObject {
return false;
};
+ // Returns true if the device's tracked cache state equals \p state; this default always returns false
+ virtual bool IsCacheFlushed(CacheState state) const {
+ return false;
+ };
+
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
virtual const uint32_t getPreferredNumaNode() const { return 0; }
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 8b7c33f7cc..4045d7cc99 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -180,6 +180,7 @@ Device::Device(hsa_agent_t bkendDevice)
gpuvm_segment_.handle = 0;
gpu_fine_grained_segment_.handle = 0;
prefetch_signal_.handle = 0;
+ cache_state_ = Device::CacheState::kCacheStateInvalid;
}
void Device::setupCpuAgent() {
@@ -2635,6 +2636,17 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t*
start, end);
}
}
+
+// ================================================================================================
+bool Device::IsCacheFlushed(Device::CacheState state) const {  // True if the last recorded state == state
+ return (static_cast(state) == cache_state_.load(std::memory_order_relaxed));
+}
+
+// ================================================================================================
+void Device::SetCacheState(Device::CacheState state) {  // Records state for later IsCacheFlushed() checks
+ cache_state_.store(static_cast(state), std::memory_order_relaxed);
+}
+
// ================================================================================================
static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 5004941a90..b908cad8d0 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -260,6 +260,8 @@ class NullDevice : public amd::Device {
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
+ virtual bool IsCacheFlushed(Device::CacheState state) const { return false; };  //!< Stub: always false
+ virtual void SetCacheState(Device::CacheState state) {};  //!< Stub: records nothing
virtual void ReleaseGlobalSignal(void* signal) const {}
#if defined(__clang__)
@@ -440,6 +442,8 @@ class Device : public NullDevice {
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
+ virtual bool IsCacheFlushed(Device::CacheState state) const;  //!< True if cache_state_ equals state
+ virtual void SetCacheState(Device::CacheState state);  //!< Stores state into cache_state_
virtual void ReleaseGlobalSignal(void* signal) const;
//! Allocate host memory in terms of numa policy set by user
@@ -583,6 +587,7 @@ class Device : public NullDevice {
hsa_amd_memory_pool_t gpuvm_segment_;
hsa_amd_memory_pool_t gpu_fine_grained_segment_;
hsa_signal_t prefetch_signal_; //!< Prefetch signal, used to explicitly prefetch SVM on device
+ std::atomic cache_state_; //!< Last cache state recorded by SetCacheState(), stored as a CacheState value
size_t gpuvm_segment_max_alloc_;
size_t alloc_granularity_;
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index b8b5d14341..7619abd224 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -77,6 +77,11 @@ static constexpr uint16_t kBarrierPacketHeader =
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+static constexpr uint16_t kBarrierPacketAgentScopeHeader =  // Barrier-AND header with the barrier bit set
+ (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
+ (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |  // acquire fence at agent scope
+ (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);  // release fence at agent scope
+
static constexpr uint16_t kNopPacketHeader =
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
@@ -126,7 +131,7 @@ void Timestamp::checkGpuTime() {
}
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
// to ROCr is very slow and that also affects the overall performance of the callback thread
- if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_) {
+ if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ > 0) {
hsa_amd_profiling_dispatch_time_t time = {};
if (it->engine_ == HwQueueEngine::Compute) {
hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
@@ -442,7 +447,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
// Update the current command/marker with HW event
prof_signal->retain();
ts->command().SetHwEvent(prof_signal);
- } else if (ts->command().profilingInfo().marker_ts_ ) {
+ } else if (ts->command().profilingInfo().marker_ts_ > 0 ) {
// Update the current command/marker with HW event
prof_signal->retain();
ts->command().SetHwEvent(prof_signal);
@@ -803,11 +808,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
+ auto cache_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+ HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
+
if (timestamp_ != nullptr) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
// Get active signal for current dispatch if profiling is necessary
- packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+ packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
+ pool_size);
}
// Make sure the slot is free for usage
@@ -845,8 +854,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
- extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
- HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+ cache_state,
rest, reinterpret_cast(packet)->grid_size_x,
reinterpret_cast(packet)->grid_size_y,
reinterpret_cast(packet)->grid_size_z,
@@ -863,6 +871,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(
//hsa_queue_store_write_index_release(gpu_queue_, index);
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1);
+ roc_device_.SetCacheState(static_cast(cache_state));
+
// Wait on signal ?
if (blocking) {
LogInfo("Runtime reachead the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
@@ -957,6 +967,8 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
+ auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+ HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
if (!skipSignal) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
@@ -987,11 +999,13 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
- extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
- HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+ cache_state,
barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1],
barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3],
barrier_packet_.dep_signal[4], barrier_packet_.completion_signal);
+
+ roc_device_.SetCacheState(static_cast(cache_state));
+
// Clear dependent signals for the next packet
barrier_packet_.dep_signal[0] = hsa_signal_t{};
barrier_packet_.dep_signal[1] = hsa_signal_t{};
@@ -2303,7 +2317,12 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
unsigned int* headerPtr = reinterpret_cast(&header);
__atomic_store_n(reinterpret_cast(aql_loc), *headerPtr, __ATOMIC_RELEASE);
+ auto cache_state = extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+ HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
+
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
+ roc_device_.SetCacheState(static_cast(cache_state));
+
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
"HWq=0x%zx, BarrierValue Header = 0x%x AmdFormat = 0x%x ",
"(type=%d, barrier=%d, acquire=%d, release=%d), "
@@ -2313,8 +2332,7 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
extractAqlBits(header.header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header.header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
- extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
- HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+ cache_state,
packet->completion_signal, packet->value, packet->mask, packet->cond,
HSA_SIGNAL_CONDITION_GTE, HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_NE);
}
@@ -3096,10 +3114,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
} else {
profilingBegin(vcmd);
if (timestamp_ != nullptr) {
- // Submit a barrier with a cache flushes.
- dispatchBarrierPacket(kBarrierPacketHeader, false);
-
- hasPendingDispatch_ = false;
+ uint32_t releaseFlags = vcmd.profilingInfo().marker_ts_;
+ if (ROC_EVENT_NO_FLUSH && releaseFlags == Device::CacheState::kCacheStateIgnore) {
+ dispatchBarrierPacket(kNopPacketHeader, false);
+ } else if (releaseFlags == Device::CacheState::kCacheStateAgent) {
+ dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false);
+ } else {
+ // Submit a barrier with a cache flushes.
+ dispatchBarrierPacket(kBarrierPacketHeader, false);
+ hasPendingDispatch_ = false;
+ }
}
profilingEnd(vcmd);
}
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index eec70f483d..cdc09f4215 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -55,8 +55,14 @@ Event::Event(HostQueue& queue)
}
// ================================================================================================
-Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
- hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }
+Event::Event()  // Default event: status CL_SUBMITTED; callbacks, HW event, notify event and device unset
+ : callbacks_(nullptr),
+ status_(CL_SUBMITTED),
+ hw_event_(nullptr),
+ notify_event_(nullptr),
+ device_(nullptr) {
+ notified_.clear();
+}
// ================================================================================================
Event::~Event() {
@@ -241,7 +247,7 @@ bool Event::awaitCompletion() {
return false;
}
- ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d",
+ ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d",
this, status());
auto* queue = command().queue();
if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
@@ -256,7 +262,7 @@ bool Event::awaitCompletion() {
lock_.wait();
}
}
- ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this);
+ ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this);
}
return status() == CL_COMPLETE;
@@ -277,7 +283,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
notified_.clear();
return false;
}
- ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+ ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
command->enqueue();
// Save notification, associated with the current event
notify_event_ = command;
@@ -290,7 +296,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
notified_.clear();
return false;
}
- ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+ ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
command->enqueue();
command->release();
}
@@ -336,7 +342,7 @@ void Command::enqueue() {
Agent::postEventCreate(as_cl(static_cast(this)), type_);
}
- ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this);
+ ClPrint(LOG_DEBUG, LOG_CMD, "Command enqueued: %p", this);
// Direct dispatch logic below will submit the command immediately, but the command status
// update will occur later after flush() with a wait
@@ -360,7 +366,7 @@ void Command::enqueue() {
EnableProfiling();
}
- if (isMarker && !profilingInfo().marker_ts_) {
+ if (isMarker && (profilingInfo().marker_ts_ == 0)) {
// Update batch head for the current marker. Hence the status of all commands can be
// updated upon the marker completion
SetBatchHead(queue_->GetSubmittionBatch());
@@ -414,6 +420,7 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList
profilingInfo_.enabled_ = true;
profilingInfo_.clear();
profilingInfo_.callback_ = nullptr;
+ profilingInfo_.marker_ts_ = 1;
}
kernel_.retain();
}
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index 395d4642e0..4ea2636ddc 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -102,7 +102,7 @@ class Event : public RuntimeObject {
static const EventWaitList nullWaitList;
struct ProfilingInfo {
- ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) {
+ ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(0) {
if (enabled) {
clear();
callback_ = nullptr;
@@ -113,10 +113,11 @@ class Event : public RuntimeObject {
uint64_t submitted_;
uint64_t start_;
uint64_t end_;
- bool enabled_; //!< Profiling enabled for the wave limiter
- uint32_t waves_; //!< The number of waves used in a dispatch
+ bool enabled_; //!< Profiling enabled for the wave limiter
+ uint32_t waves_; //!< The number of waves used in a dispatch
ProfilingCallback* callback_;
- bool marker_ts_;
+ uint32_t marker_ts_; //!< Marker with release scope; 0 is the constructor default
+ //!< 5 - system scope, 3 - device scope, 1 - no scopes. NOTE(review): VirtualGPU::submitMarker() compares this value with CacheState 0/1 - confirm the encoding
void clear() {
queued_ = 0ULL;
submitted_ = 0ULL;
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index e8b914971d..daae776bd5 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -106,20 +106,29 @@ bool HostQueue::terminate() {
void HostQueue::finish() {
Command* command = nullptr;
+ // True when the device last recorded a system-scope release, i.e. no extra flush marker is needed.
+ // NOTE(review): the state is tracked per device, not per queue - confirm this is safe when
+ // several queues share the device.
+ bool isCacheFlushed = device().IsCacheFlushed(Device::CacheState::kCacheStateSystem);
if (IS_HIP) {
command = getLastQueuedCommand(true);
- // Check if the queue has nothing to process and return
- if (AMD_DIRECT_DISPATCH && command == nullptr) {
+ // Return early only if the queue has nothing to process and the cache is already flushed
+ if (AMD_DIRECT_DISPATCH && isCacheFlushed && command == nullptr) {
return;
}
}
- if (nullptr == command) {
+ if (nullptr == command || !isCacheFlushed) {
+ // getLastQueuedCommand(true) is expected to return a retained command; drop that reference
+ // before the pointer is replaced by the marker, otherwise it is never released.
+ if (nullptr != command) {
+ command->release();
+ }
// Send a finish to make sure we finished all commands
command = new Marker(*this, false);
if (command == NULL) {
return;
}
- ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
+ ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued, Cache Flushed = %d", isCacheFlushed);
command->enqueue();
}
// Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
@@ -194,7 +194,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
continue;
}
- ClPrint(LOG_DEBUG, LOG_CMD, "command (%s) is submitted: %p", getOclCommandKindString(command->type()), command);
+ ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p", getOclCommandKindString(command->type()), command);
command->setStatus(CL_SUBMITTED);
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 147349cc6b..0912657d2e 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -273,7 +273,9 @@ release(uint, ROC_AQL_QUEUE_SIZE, 4096, \
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
"If true, then runtime can skip kernel arg copy") \
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
- "Force the stream wait memory operation to wait on CP.")
+ "Force the stream wait memory operation to wait on CP.") \
+release(bool, ROC_EVENT_NO_FLUSH, false, \
+ "Use NOP AQL packet for event records with no explicit flags.")
namespace amd {