SWDEV-301667 - Add cache state for a device
- Add a global cache state for a device to indicate scopes of submitted
AQL packets
- Remove scopes for TS marker if hipEventReleaseToDevice is passed. Set
env ROC_EVENT_NO_FLUSH=1 to use NOP AQL for event records.
By default, event records flush caches with a system-scope release.
- Calling finish() should check whether caches are flushed and, if not, queue a
marker
Change-Id: Ibbbdbb1cd7ac61cb35649169212142545be159e0
[ROCm/clr commit: 8eeaa998c0]
Этот коммит содержится в:
@@ -1527,6 +1527,13 @@ class Device : public RuntimeObject {
|
||||
kKernArg = 2
|
||||
} MemorySegment;
|
||||
|
||||
// Cache state recorded for a device. The dispatch paths obtain the
// non-negative values by casting the release fence scope bits of a submitted
// AQL packet header to CacheState before calling SetCacheState().
typedef enum CacheState {
|
||||
kCacheStateInvalid = -1,  // value assigned to cache_state_ in the roc Device constructor
|
||||
kCacheStateIgnore = 0,  // release scope bits == 0 (presumably HSA_FENCE_SCOPE_NONE -- confirm)
|
||||
kCacheStateAgent = 1,  // release scope bits == 1 (presumably HSA_FENCE_SCOPE_AGENT -- confirm)
|
||||
kCacheStateSystem = 2  // state HostQueue::finish() tests for before deciding to queue a marker
|
||||
} CacheState;
|
||||
|
||||
typedef std::pair<LinkAttribute, int32_t /* value */> LinkAttrType;
|
||||
|
||||
static constexpr size_t kP2PStagingSize = 4 * Mi;
|
||||
@@ -1749,6 +1756,11 @@ class Device : public RuntimeObject {
|
||||
return false;
|
||||
};
|
||||
|
||||
// Returns true if the device cache state equals the given state.
// This base implementation always returns false.
|
||||
virtual bool IsCacheFlushed(CacheState state) const {
|
||||
return false;
|
||||
};
|
||||
|
||||
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
|
||||
|
||||
virtual const uint32_t getPreferredNumaNode() const { return 0; }
|
||||
|
||||
@@ -180,6 +180,7 @@ Device::Device(hsa_agent_t bkendDevice)
|
||||
gpuvm_segment_.handle = 0;
|
||||
gpu_fine_grained_segment_.handle = 0;
|
||||
prefetch_signal_.handle = 0;
|
||||
cache_state_ = Device::CacheState::kCacheStateInvalid;
|
||||
}
|
||||
|
||||
void Device::setupCpuAgent() {
|
||||
@@ -2635,6 +2636,17 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t*
|
||||
start, end);
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Returns true when the cache state currently stored in cache_state_ equals
// `state`. The stored value is read with relaxed memory ordering.
bool Device::IsCacheFlushed(Device::CacheState state) const {
|
||||
return (static_cast<int>(state) == cache_state_.load(std::memory_order_relaxed));
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Stores `state` (as an int) in cache_state_ using relaxed memory ordering.
// The packet dispatch paths call this with the release fence scope extracted
// from the header of the packet they submit.
void Device::SetCacheState(Device::CacheState state) {
|
||||
cache_state_.store(static_cast<int>(state), std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
|
||||
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
|
||||
|
||||
@@ -260,6 +260,8 @@ class NullDevice : public amd::Device {
|
||||
|
||||
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
|
||||
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
|
||||
virtual bool IsCacheFlushed(Device::CacheState state) const { return false; };
|
||||
virtual void SetCacheState(Device::CacheState state) {};
|
||||
virtual void ReleaseGlobalSignal(void* signal) const {}
|
||||
|
||||
#if defined(__clang__)
|
||||
@@ -440,6 +442,8 @@ class Device : public NullDevice {
|
||||
|
||||
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
|
||||
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
|
||||
virtual bool IsCacheFlushed(Device::CacheState state) const;
|
||||
virtual void SetCacheState(Device::CacheState state);
|
||||
virtual void ReleaseGlobalSignal(void* signal) const;
|
||||
|
||||
//! Allocate host memory in terms of numa policy set by user
|
||||
@@ -583,6 +587,7 @@ class Device : public NullDevice {
|
||||
hsa_amd_memory_pool_t gpuvm_segment_;
|
||||
hsa_amd_memory_pool_t gpu_fine_grained_segment_;
|
||||
hsa_signal_t prefetch_signal_; //!< Prefetch signal, used to explicitly prefetch SVM on device
|
||||
std::atomic<int> cache_state_; //!< State of cache, holds a Device::CacheState value stored as int (kCacheStateInvalid/kCacheStateIgnore/kCacheStateAgent/kCacheStateSystem)
|
||||
|
||||
size_t gpuvm_segment_max_alloc_;
|
||||
size_t alloc_granularity_;
|
||||
|
||||
@@ -77,6 +77,11 @@ static constexpr uint16_t kBarrierPacketHeader =
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
|
||||
// Barrier-AND packet header with the barrier bit set and agent scope for both
// the acquire and the release fence.
static constexpr uint16_t kBarrierPacketAgentScopeHeader =
|
||||
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
|
||||
static constexpr uint16_t kNopPacketHeader =
|
||||
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
|
||||
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
@@ -126,7 +131,7 @@ void Timestamp::checkGpuTime() {
|
||||
}
|
||||
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
|
||||
// to ROCr is very slow and that also affects the overall performance of the callback thread
|
||||
if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_) {
|
||||
if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ > 0) {
|
||||
hsa_amd_profiling_dispatch_time_t time = {};
|
||||
if (it->engine_ == HwQueueEngine::Compute) {
|
||||
hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
|
||||
@@ -442,7 +447,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
|
||||
// Update the current command/marker with HW event
|
||||
prof_signal->retain();
|
||||
ts->command().SetHwEvent(prof_signal);
|
||||
} else if (ts->command().profilingInfo().marker_ts_ ) {
|
||||
} else if (ts->command().profilingInfo().marker_ts_ > 0 ) {
|
||||
// Update the current command/marker with HW event
|
||||
prof_signal->retain();
|
||||
ts->command().SetHwEvent(prof_signal);
|
||||
@@ -803,11 +808,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
|
||||
|
||||
auto cache_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
|
||||
if (timestamp_ != nullptr) {
|
||||
// Pool size must grow to the size of pending AQL packets
|
||||
const uint32_t pool_size = index - read;
|
||||
// Get active signal for current dispatch if profiling is necessary
|
||||
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
|
||||
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
|
||||
pool_size);
|
||||
}
|
||||
|
||||
// Make sure the slot is free for usage
|
||||
@@ -845,8 +854,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
HSA_PACKET_HEADER_WIDTH_BARRIER),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
|
||||
cache_state,
|
||||
rest, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
|
||||
@@ -863,6 +871,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
//hsa_queue_store_write_index_release(gpu_queue_, index);
|
||||
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1);
|
||||
|
||||
roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
|
||||
|
||||
// Wait on signal ?
|
||||
if (blocking) {
|
||||
LogInfo("Runtime reachead the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
|
||||
@@ -957,6 +967,8 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
|
||||
|
||||
auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
if (!skipSignal) {
|
||||
// Pool size must grow to the size of pending AQL packets
|
||||
const uint32_t pool_size = index - read;
|
||||
@@ -987,11 +999,13 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
HSA_PACKET_HEADER_WIDTH_BARRIER),
|
||||
extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
|
||||
extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
|
||||
cache_state,
|
||||
barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1],
|
||||
barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3],
|
||||
barrier_packet_.dep_signal[4], barrier_packet_.completion_signal);
|
||||
|
||||
roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
|
||||
|
||||
// Clear dependent signals for the next packet
|
||||
barrier_packet_.dep_signal[0] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[1] = hsa_signal_t{};
|
||||
@@ -2303,7 +2317,12 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
|
||||
unsigned int* headerPtr = reinterpret_cast<unsigned int*>(&header);
|
||||
__atomic_store_n(reinterpret_cast<uint32_t*>(aql_loc), *headerPtr, __ATOMIC_RELEASE);
|
||||
|
||||
auto cache_state = extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
|
||||
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
|
||||
roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
|
||||
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
|
||||
"HWq=0x%zx, BarrierValue Header = 0x%x AmdFormat = 0x%x ",
|
||||
"(type=%d, barrier=%d, acquire=%d, release=%d), "
|
||||
@@ -2313,8 +2332,7 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
|
||||
extractAqlBits(header.header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
|
||||
extractAqlBits(header.header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
|
||||
extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
|
||||
cache_state,
|
||||
packet->completion_signal, packet->value, packet->mask, packet->cond,
|
||||
HSA_SIGNAL_CONDITION_GTE, HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_NE);
|
||||
}
|
||||
@@ -3096,10 +3114,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
|
||||
} else {
|
||||
profilingBegin(vcmd);
|
||||
if (timestamp_ != nullptr) {
|
||||
// Submit a barrier with a cache flushes.
|
||||
dispatchBarrierPacket(kBarrierPacketHeader, false);
|
||||
|
||||
hasPendingDispatch_ = false;
|
||||
uint32_t releaseFlags = vcmd.profilingInfo().marker_ts_;
|
||||
if (ROC_EVENT_NO_FLUSH && releaseFlags == Device::CacheState::kCacheStateIgnore) {
|
||||
dispatchBarrierPacket(kNopPacketHeader, false);
|
||||
} else if (releaseFlags == Device::CacheState::kCacheStateAgent) {
|
||||
dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false);
|
||||
} else {
|
||||
// Submit a barrier with cache flushes.
|
||||
dispatchBarrierPacket(kBarrierPacketHeader, false);
|
||||
hasPendingDispatch_ = false;
|
||||
}
|
||||
}
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
|
||||
@@ -55,8 +55,14 @@ Event::Event(HostQueue& queue)
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
|
||||
hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }
|
||||
Event::Event()
|
||||
: callbacks_(NULL),
|
||||
status_(CL_SUBMITTED),
|
||||
hw_event_(nullptr),
|
||||
notify_event_(nullptr),
|
||||
device_(nullptr) {
|
||||
notified_.clear();
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
Event::~Event() {
|
||||
@@ -241,7 +247,7 @@ bool Event::awaitCompletion() {
|
||||
return false;
|
||||
}
|
||||
|
||||
ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d",
|
||||
ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d",
|
||||
this, status());
|
||||
auto* queue = command().queue();
|
||||
if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
|
||||
@@ -256,7 +262,7 @@ bool Event::awaitCompletion() {
|
||||
lock_.wait();
|
||||
}
|
||||
}
|
||||
ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this);
|
||||
ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this);
|
||||
}
|
||||
|
||||
return status() == CL_COMPLETE;
|
||||
@@ -277,7 +283,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
|
||||
notified_.clear();
|
||||
return false;
|
||||
}
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
|
||||
command->enqueue();
|
||||
// Save notification, associated with the current event
|
||||
notify_event_ = command;
|
||||
@@ -290,7 +296,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
|
||||
notified_.clear();
|
||||
return false;
|
||||
}
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
|
||||
command->enqueue();
|
||||
command->release();
|
||||
}
|
||||
@@ -336,7 +342,7 @@ void Command::enqueue() {
|
||||
Agent::postEventCreate(as_cl(static_cast<Event*>(this)), type_);
|
||||
}
|
||||
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this);
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Command enqueued: %p", this);
|
||||
|
||||
// Direct dispatch logic below will submit the command immediately, but the command status
|
||||
// update will occur later after flush() with a wait
|
||||
@@ -360,7 +366,7 @@ void Command::enqueue() {
|
||||
EnableProfiling();
|
||||
}
|
||||
|
||||
if (isMarker && !profilingInfo().marker_ts_) {
|
||||
if (isMarker && (profilingInfo().marker_ts_ == 0)) {
|
||||
// Update batch head for the current marker. Hence the status of all commands can be
|
||||
// updated upon the marker completion
|
||||
SetBatchHead(queue_->GetSubmittionBatch());
|
||||
@@ -414,6 +420,7 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList
|
||||
profilingInfo_.enabled_ = true;
|
||||
profilingInfo_.clear();
|
||||
profilingInfo_.callback_ = nullptr;
|
||||
profilingInfo_.marker_ts_ = 1;
|
||||
}
|
||||
kernel_.retain();
|
||||
}
|
||||
|
||||
@@ -102,7 +102,7 @@ class Event : public RuntimeObject {
|
||||
static const EventWaitList nullWaitList;
|
||||
|
||||
struct ProfilingInfo {
|
||||
ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) {
|
||||
ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(0) {
|
||||
if (enabled) {
|
||||
clear();
|
||||
callback_ = nullptr;
|
||||
@@ -113,10 +113,11 @@ class Event : public RuntimeObject {
|
||||
uint64_t submitted_;
|
||||
uint64_t start_;
|
||||
uint64_t end_;
|
||||
bool enabled_; //!< Profiling enabled for the wave limiter
|
||||
uint32_t waves_; //!< The number of waves used in a dispatch
|
||||
bool enabled_; //!< Profiling enabled for the wave limiter
|
||||
uint32_t waves_; //!< The number of waves used in a dispatch
|
||||
ProfilingCallback* callback_;
|
||||
bool marker_ts_;
|
||||
uint32_t marker_ts_; //!< Marker with release scope
|
||||
//!< 5 - system scope, 3 - device scope, 1 - no scopes
//!< NOTE(review): VirtualGPU::submitMarker() compares this value with the
//!< Device::CacheState enumerators kCacheStateIgnore (0) and kCacheStateAgent (1),
//!< which does not match the encoding listed above -- confirm which is intended
|
||||
void clear() {
|
||||
queued_ = 0ULL;
|
||||
submitted_ = 0ULL;
|
||||
|
||||
@@ -106,20 +106,20 @@ bool HostQueue::terminate() {
|
||||
|
||||
void HostQueue::finish() {
|
||||
Command* command = nullptr;
|
||||
bool isCacheFlushed = device().IsCacheFlushed(Device::CacheState::kCacheStateSystem);
|
||||
if (IS_HIP) {
|
||||
command = getLastQueuedCommand(true);
|
||||
// Check if the queue has nothing to process and return
|
||||
if (AMD_DIRECT_DISPATCH && command == nullptr) {
|
||||
if (AMD_DIRECT_DISPATCH && isCacheFlushed && command == nullptr) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (nullptr == command) {
|
||||
if (nullptr == command || !isCacheFlushed) {
|
||||
// Send a finish to make sure we finished all commands
|
||||
command = new Marker(*this, false);
|
||||
if (command == NULL) {
|
||||
return;
|
||||
}
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued, Cache Flushed = %d", isCacheFlushed);
|
||||
command->enqueue();
|
||||
}
|
||||
// Check HW status of the ROCclr event. Note: not all ROCclr modes support HW status
|
||||
@@ -194,7 +194,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "command (%s) is submitted: %p", getOclCommandKindString(command->type()), command);
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p", getOclCommandKindString(command->type()), command);
|
||||
|
||||
command->setStatus(CL_SUBMITTED);
|
||||
|
||||
|
||||
@@ -273,7 +273,9 @@ release(uint, ROC_AQL_QUEUE_SIZE, 4096, \
|
||||
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
|
||||
"If true, then runtime can skip kernel arg copy") \
|
||||
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
|
||||
"Force the stream wait memory operation to wait on CP.")
|
||||
"Force the stream wait memory operation to wait on CP.") \
|
||||
release(bool, ROC_EVENT_NO_FLUSH, false, \
|
||||
"Use NOP AQL packet for event records with no explicit flags.")
|
||||
|
||||
namespace amd {
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user