SWDEV-301667 - Add cache state for a device

- Add a device-wide cache state that records the release fence scope of
the most recently submitted AQL packet
- Remove scopes for the TS marker if hipEventReleaseToDevice is passed.
Set the environment variable ROC_EVENT_NO_FLUSH=1 to use a NOP AQL packet
for event records. By default, caches are flushed with a system-scope release.
- finish() now checks whether the caches have been flushed and, if they
have not, queues a marker

Change-Id: Ibbbdbb1cd7ac61cb35649169212142545be159e0


[ROCm/clr commit: 8eeaa998c0]
Этот коммит содержится в:
Saleel Kudchadker
2022-03-29 22:48:19 -07:00
родитель cb7802bc8a
Коммит 3d0100c5ab
8 изменённых файлов: 94 добавлений и 31 удалений
+12
Просмотреть файл
@@ -1527,6 +1527,13 @@ class Device : public RuntimeObject {
kKernArg = 2
} MemorySegment;
typedef enum CacheState {
kCacheStateInvalid = -1,
kCacheStateIgnore = 0,
kCacheStateAgent = 1,
kCacheStateSystem = 2
} CacheState;
typedef std::pair<LinkAttribute, int32_t /* value */> LinkAttrType;
static constexpr size_t kP2PStagingSize = 4 * Mi;
@@ -1749,6 +1756,11 @@ class Device : public RuntimeObject {
return false;
};
// Returns bool value if the device cache is equal to the parameter
virtual bool IsCacheFlushed(CacheState state) const {
return false;
};
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
virtual const uint32_t getPreferredNumaNode() const { return 0; }
+12
Просмотреть файл
@@ -180,6 +180,7 @@ Device::Device(hsa_agent_t bkendDevice)
gpuvm_segment_.handle = 0;
gpu_fine_grained_segment_.handle = 0;
prefetch_signal_.handle = 0;
cache_state_ = Device::CacheState::kCacheStateInvalid;
}
void Device::setupCpuAgent() {
@@ -2635,6 +2636,17 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t*
start, end);
}
}
// ================================================================================================
bool Device::IsCacheFlushed(Device::CacheState state) const {
return (static_cast<int>(state) == cache_state_.load(std::memory_order_relaxed));
}
// ================================================================================================
void Device::SetCacheState(Device::CacheState state) {
cache_state_.store(static_cast<int>(state), std::memory_order_relaxed);
}
// ================================================================================================
static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
+5
Просмотреть файл
@@ -260,6 +260,8 @@ class NullDevice : public amd::Device {
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
virtual bool IsCacheFlushed(Device::CacheState state) const { return false; };
virtual void SetCacheState(Device::CacheState state) {};
virtual void ReleaseGlobalSignal(void* signal) const {}
#if defined(__clang__)
@@ -440,6 +442,8 @@ class Device : public NullDevice {
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
//! Returns true if the cache state recorded for this device equals \a state
virtual bool IsCacheFlushed(Device::CacheState state) const;
//! Records \a state as the current cache state of this device
virtual void SetCacheState(Device::CacheState state);
virtual void ReleaseGlobalSignal(void* signal) const;
//! Allocate host memory in terms of numa policy set by user
@@ -583,6 +587,7 @@ class Device : public NullDevice {
hsa_amd_memory_pool_t gpuvm_segment_;
hsa_amd_memory_pool_t gpu_fine_grained_segment_;
hsa_signal_t prefetch_signal_; //!< Prefetch signal, used to explicitly prefetch SVM on device
std::atomic<int> cache_state_; //!< Cache state, holds a Device::CacheState value (kCacheStateInvalid/Ignore/Agent/System)
size_t gpuvm_segment_max_alloc_;
size_t alloc_granularity_;
+37 -13
Просмотреть файл
@@ -77,6 +77,11 @@ static constexpr uint16_t kBarrierPacketHeader =
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
// Barrier-AND packet header with the barrier bit set; both the acquire and the release
// fence use agent scope
static constexpr uint16_t kBarrierPacketAgentScopeHeader =
    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE);
static constexpr uint16_t kNopPacketHeader =
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
@@ -126,7 +131,7 @@ void Timestamp::checkGpuTime() {
}
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
// to ROCr is very slow and that also affects the overall performance of the callback thread
if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_) {
if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ > 0) {
hsa_amd_profiling_dispatch_time_t time = {};
if (it->engine_ == HwQueueEngine::Compute) {
hsa_amd_profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
@@ -442,7 +447,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
// Update the current command/marker with HW event
prof_signal->retain();
ts->command().SetHwEvent(prof_signal);
} else if (ts->command().profilingInfo().marker_ts_ ) {
} else if (ts->command().profilingInfo().marker_ts_ > 0 ) {
// Update the current command/marker with HW event
prof_signal->retain();
ts->command().SetHwEvent(prof_signal);
@@ -803,11 +808,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
auto cache_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
if (timestamp_ != nullptr) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
// Get active signal for current dispatch if profiling is necessary
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
pool_size);
}
// Make sure the slot is free for usage
@@ -845,8 +854,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
cache_state,
rest, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
@@ -863,6 +871,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(
//hsa_queue_store_write_index_release(gpu_queue_, index);
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1);
roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
// Wait on signal ?
if (blocking) {
LogInfo("Runtime reachead the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
@@ -957,6 +967,8 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
if (!skipSignal) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
@@ -987,11 +999,13 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
cache_state,
barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1],
barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3],
barrier_packet_.dep_signal[4], barrier_packet_.completion_signal);
roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
// Clear dependent signals for the next packet
barrier_packet_.dep_signal[0] = hsa_signal_t{};
barrier_packet_.dep_signal[1] = hsa_signal_t{};
@@ -2303,7 +2317,12 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
unsigned int* headerPtr = reinterpret_cast<unsigned int*>(&header);
__atomic_store_n(reinterpret_cast<uint32_t*>(aql_loc), *headerPtr, __ATOMIC_RELEASE);
auto cache_state = extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
roc_device_.SetCacheState(static_cast<Device::CacheState>(cache_state));
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
"HWq=0x%zx, BarrierValue Header = 0x%x AmdFormat = 0x%x ",
"(type=%d, barrier=%d, acquire=%d, release=%d), "
@@ -2313,8 +2332,7 @@ void VirtualGPU::dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t
extractAqlBits(header.header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header.header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(header.header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
cache_state,
packet->completion_signal, packet->value, packet->mask, packet->cond,
HSA_SIGNAL_CONDITION_GTE, HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_NE);
}
@@ -3096,10 +3114,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
} else {
profilingBegin(vcmd);
if (timestamp_ != nullptr) {
// Submit a barrier with a cache flushes.
dispatchBarrierPacket(kBarrierPacketHeader, false);
hasPendingDispatch_ = false;
uint32_t releaseFlags = vcmd.profilingInfo().marker_ts_;
if (ROC_EVENT_NO_FLUSH && releaseFlags == Device::CacheState::kCacheStateIgnore) {
dispatchBarrierPacket(kNopPacketHeader, false);
} else if (releaseFlags == Device::CacheState::kCacheStateAgent) {
dispatchBarrierPacket(kBarrierPacketAgentScopeHeader, false);
} else {
// Submit a barrier with a cache flushes.
dispatchBarrierPacket(kBarrierPacketHeader, false);
hasPendingDispatch_ = false;
}
}
profilingEnd(vcmd);
}
+15 -8
Просмотреть файл
@@ -55,8 +55,14 @@ Event::Event(HostQueue& queue)
}
// ================================================================================================
Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }
// Default constructor: starts in the CL_SUBMITTED state with no callbacks, HW event,
// notification event or device attached
Event::Event()
    : callbacks_(nullptr),
      status_(CL_SUBMITTED),
      hw_event_(nullptr),
      notify_event_(nullptr),
      device_(nullptr) {
  // Start with the notification flag cleared
  notified_.clear();
}
// ================================================================================================
Event::~Event() {
@@ -241,7 +247,7 @@ bool Event::awaitCompletion() {
return false;
}
ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d",
ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d",
this, status());
auto* queue = command().queue();
if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
@@ -256,7 +262,7 @@ bool Event::awaitCompletion() {
lock_.wait();
}
}
ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this);
ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this);
}
return status() == CL_COMPLETE;
@@ -277,7 +283,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
notified_.clear();
return false;
}
ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
command->enqueue();
// Save notification, associated with the current event
notify_event_ = command;
@@ -290,7 +296,7 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
notified_.clear();
return false;
}
ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
command->enqueue();
command->release();
}
@@ -336,7 +342,7 @@ void Command::enqueue() {
Agent::postEventCreate(as_cl(static_cast<Event*>(this)), type_);
}
ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this);
ClPrint(LOG_DEBUG, LOG_CMD, "Command enqueued: %p", this);
// Direct dispatch logic below will submit the command immediately, but the command status
// update will occur later after flush() with a wait
@@ -360,7 +366,7 @@ void Command::enqueue() {
EnableProfiling();
}
if (isMarker && !profilingInfo().marker_ts_) {
if (isMarker && (profilingInfo().marker_ts_ == 0)) {
// Update batch head for the current marker. Hence the status of all commands can be
// updated upon the marker completion
SetBatchHead(queue_->GetSubmittionBatch());
@@ -414,6 +420,7 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList
profilingInfo_.enabled_ = true;
profilingInfo_.clear();
profilingInfo_.callback_ = nullptr;
profilingInfo_.marker_ts_ = 1;
}
kernel_.retain();
}
+5 -4
Просмотреть файл
@@ -102,7 +102,7 @@ class Event : public RuntimeObject {
static const EventWaitList nullWaitList;
struct ProfilingInfo {
ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) {
ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(0) {
if (enabled) {
clear();
callback_ = nullptr;
@@ -113,10 +113,11 @@ class Event : public RuntimeObject {
uint64_t submitted_;
uint64_t start_;
uint64_t end_;
bool enabled_; //!< Profiling enabled for the wave limiter
uint32_t waves_; //!< The number of waves used in a dispatch
bool enabled_; //!< Profiling enabled for the wave limiter
uint32_t waves_; //!< The number of waves used in a dispatch
ProfilingCallback* callback_;
bool marker_ts_;
uint32_t marker_ts_; //!< Marker with release scope
//!< 5 - system scope, 3 - device scope, 1 - no scopes
//!< NOTE(review): VirtualGPU::submitMarker compares this value against
//!< Device::CacheState (0 - ignore, 1 - agent), which does not match the
//!< 5/3/1 encoding listed above - confirm which encoding is intended
void clear() {
queued_ = 0ULL;
submitted_ = 0ULL;
+5 -5
Просмотреть файл
@@ -106,20 +106,20 @@ bool HostQueue::terminate() {
void HostQueue::finish() {
Command* command = nullptr;
bool isCacheFlushed = device().IsCacheFlushed(Device::CacheState::kCacheStateSystem);
if (IS_HIP) {
command = getLastQueuedCommand(true);
// Check if the queue has nothing to process and return
if (AMD_DIRECT_DISPATCH && command == nullptr) {
if (AMD_DIRECT_DISPATCH && isCacheFlushed && command == nullptr) {
return;
}
}
if (nullptr == command) {
if (nullptr == command || !isCacheFlushed) {
// Send a finish to make sure we finished all commands
command = new Marker(*this, false);
if (command == NULL) {
return;
}
ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued, Cache Flushed = %d", isCacheFlushed);
command->enqueue();
}
// Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
@@ -194,7 +194,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
continue;
}
ClPrint(LOG_DEBUG, LOG_CMD, "command (%s) is submitted: %p", getOclCommandKindString(command->type()), command);
ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p", getOclCommandKindString(command->type()), command);
command->setStatus(CL_SUBMITTED);
+3 -1
Просмотреть файл
@@ -273,7 +273,9 @@ release(uint, ROC_AQL_QUEUE_SIZE, 4096, \
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
"If true, then runtime can skip kernel arg copy") \
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
"Force the stream wait memory operation to wait on CP.")
"Force the stream wait memory operation to wait on CP.") \
release(bool, ROC_EVENT_NO_FLUSH, false, \
"Use NOP AQL packet for event records with no explicit flags.")
namespace amd {