2
0

SWDEV-257787 - Add engine tracking per signal

- The logic tracks compute and SDMA read/write operations and
applies signals when necessary
- ROC_CPU_WAIT_FOR_SIGNAL, ROC_SYSTEM_SCOPE_SIGNAL
and ROC_SKIP_COPY_SYNC were added to control the tracking

Change-Id: I9e8e6174c63bf7784f7ab00964e2918c8667d364


[ROCm/clr commit: dbc7abaecf]
Este cometimento está contido em:
German Andryeyev
2021-01-22 16:42:35 -05:00
cometido por Saleel Kudchadker
ascendente 768a4383cd
cometimento f96e973378
8 ficheiros modificados com 312 adições e 159 eliminações
+49 -22
Ver ficheiro
@@ -433,18 +433,27 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
hsa_dim3_t offset = { 0, 0 ,0 };
if ((srcRect.rowPitch_ % 4 != 0) ||
(srcRect.slicePitch_ % 4 != 0) ||
(dstRect.rowPitch_ % 4 != 0) ||
if ((srcRect.rowPitch_ % 4 != 0) ||
(srcRect.slicePitch_ % 4 != 0) ||
(dstRect.rowPitch_ % 4 != 0) ||
(dstRect.slicePitch_ % 4 != 0)) {
isSubwindowRectCopy = false;
}
HwQueueEngine engine = HwQueueEngine::Unknown;
if ((srcAgent.handle == dev().getCpuAgent().handle) &&
(dstAgent.handle != dev().getCpuAgent().handle)) {
engine = HwQueueEngine::SdmaWrite;
} else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
(dstAgent.handle == dev().getCpuAgent().handle)) {
engine = HwQueueEngine::SdmaRead;
}
hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
if (isSubwindowRectCopy ) {
hsa_signal_t wait = gpu().Barriers().WaitSignal();
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
// Copy memory line by line
hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
@@ -457,10 +466,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
} else {
// Fall to line by line copies
const hsa_signal_value_t kInitVal = size[2] * size[1];
hsa_signal_t wait = gpu().Barriers().WaitSignal();
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
for (size_t z = 0; z < size[2]; ++z) {
for (size_t y = 0; y < size[1]; ++y) {
@@ -472,18 +478,18 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
(reinterpret_cast<address>(dst) + dstOffset), dstAgent,
(reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
size[0], num_wait_events, wait_event, active);
gpu().setLastCommandSDMA(true) ;
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("DMA buffer failed with code %d", status);
return false;
} else {
gpu().setLastCommandSDMA(true);
}
}
}
}
}
// Explicit wait for now, until runtime could distinguish compute and sdma operations
gpu().Barriers().WaitCurrent();
return true;
}
@@ -644,18 +650,24 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
srcAgent = dstAgent = dev().getBackendDevice();
}
hsa_signal_t wait = gpu().Barriers().WaitSignal();
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
HwQueueEngine engine = HwQueueEngine::Unknown;
if ((srcAgent.handle == dev().getCpuAgent().handle) &&
(dstAgent.handle != dev().getCpuAgent().handle)) {
engine = HwQueueEngine::SdmaWrite;
} else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
(dstAgent.handle == dev().getCpuAgent().handle)) {
engine = HwQueueEngine::SdmaRead;
}
hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Use SDMA to transfer the data
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
size[0], num_wait_events, wait_event, active);
gpu().setLastCommandSDMA(true);
// Explicit wait for now, until runtime could distinguish compute and sdma operations
gpu().Barriers().WaitCurrent();
if (status == HSA_STATUS_SUCCESS) {
gpu().setLastCommandSDMA(true);
gpu().addSystemScope();
} else {
gpu().Barriers().ResetCurrentSignal();
@@ -690,7 +702,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
// Allocate requested size of memory
while (totalSize > 0) {
size = std::min(totalSize, dev().settings().stagedXferSize_);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Copy data from Host to Device
if (hostToDev) {
@@ -700,14 +711,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
const hsa_agent_t srcAgent =
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
HwQueueEngine engine = HwQueueEngine::Unknown;
if (srcAgent.handle == dev().getBackendDevice().handle) {
engine = HwQueueEngine::SdmaWrite;
}
gpu().Barriers().SetActiveEngine(engine);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
memcpy(hsaBuffer, hostSrc + offset, size);
status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
srcAgent, size, 0, nullptr, active);
gpu().setLastCommandSDMA(true);
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("Hsa copy from host to device failed with code %d", status);
return false;
} else {
gpu().setLastCommandSDMA(true);
}
gpu().Barriers().WaitCurrent();
totalSize -= size;
@@ -721,14 +740,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
const hsa_agent_t dstAgent =
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
HwQueueEngine engine = HwQueueEngine::Unknown;
if (dstAgent.handle == dev().getBackendDevice().handle) {
engine = HwQueueEngine::SdmaRead;
}
gpu().Barriers().SetActiveEngine(engine);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Copy data from Device to Host
status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
dev().getBackendDevice(), size, 0, nullptr, active);
gpu().setLastCommandSDMA(true);
if (status == HSA_STATUS_SUCCESS) {
gpu().setLastCommandSDMA(true);
gpu().Barriers().WaitCurrent();
memcpy(hostDst + offset, hsaBuffer, size);
} else {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("Hsa copy from device to host failed with code %d", status);
return false;
}
-3
Ver ficheiro
@@ -40,9 +40,6 @@ class Kernel;
class Memory;
class VirtualGPU;
constexpr bool kSkipCpuWait = true;
constexpr bool kIgnoreBarrier = false;
//! DMA Blit Manager
class DmaBlitManager : public device::HostBlitManager {
public:
+5 -1
Ver ficheiro
@@ -33,11 +33,15 @@ static constexpr uint DeviceQueueMaskSize = 32;
//! Set to match the number of pipes, which is 8.
static constexpr uint kMaxAsyncQueues = 8;
constexpr bool kSkipCpuWait = true;
constexpr bool kIgnoreBarrier = false;
enum HwQueueEngine : uint32_t {
Compute = 0,
SdmaRead = 1,
SdmaWrite = 2,
Unknown = 3
Unknown = 3,
External = 4
};
} // namespace roc
+9
Ver ficheiro
@@ -28,6 +28,7 @@
namespace roc {
// ================================================================================================
Settings::Settings() {
// Initialize the HSA device default settings
@@ -91,8 +92,15 @@ Settings::Settings() {
rocr_backend_ = true;
barrier_sync_ = (!flagIsDefault(ROC_BARRIER_SYNC)) ? ROC_BARRIER_SYNC : true;
cpu_wait_for_signal_ = !AMD_DIRECT_DISPATCH;
cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ?
ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_;
system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL;
skip_copy_sync_ = ROC_SKIP_COPY_SYNC;
}
// ================================================================================================
bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, bool enableXNACK,
bool coop_groups) {
customHostAllocator_ = false;
@@ -169,6 +177,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
return true;
}
// ================================================================================================
void Settings::override() {
// Limit reported workgroup size
if (GPU_MAX_WORKGROUP_SIZE != 0) {
+4 -1
Ver ficheiro
@@ -52,7 +52,10 @@ class Settings : public device::Settings {
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
uint barrier_sync_ : 1; //!< Use AQL barrier command to sync with CPU
uint reserved_ : 23;
uint cpu_wait_for_signal_ : 1; //!< Wait for HSA signal on CPU
uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system
uint skip_copy_sync_ : 1; //!< Ignore explicit HSA signal waits for copy functionality
uint reserved_ : 20;
};
uint value_;
};
+206 -11
Ver ficheiro
@@ -35,10 +35,12 @@
#include "amd_hsa_kernel_code.h"
#include <fstream>
#include <vector>
#include <string>
#include <limits>
#include <memory>
#include <string>
#include <thread>
#include <vector>
/**
* HSA image object size in bytes (see HSAIL spec)
@@ -100,6 +102,37 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
return (v >> pos) & ((1 << width) - 1);
};
// ================================================================================================
void Timestamp::checkGpuTime() {
if (HwProfiling()) {
uint64_t start = std::numeric_limits<uint64_t>::max();
uint64_t end = 0;
for (auto it : signals_) {
if (hsa_signal_load_relaxed(it->signal_) > 0) {
WaitForSignal(it->signal_);
}
hsa_amd_profiling_dispatch_time_t time = {};
if (it->engine_ == HwQueueEngine::Compute) {
hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
} else {
hsa_amd_profiling_async_copy_time_t time_sdma = {};
hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
time.start = time_sdma.start;
time.end = time_sdma.end;
}
start = std::min(time.start, start);
end = std::max(time.end, end);
it->ts_ = nullptr;
it->done_ = true;
}
signals_.clear();
start_ = start * ticksToTime_;
end_ = end * ticksToTime_;
}
}
// ================================================================================================
bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
if (numMemObj > 0) {
// Allocate the array of memory objects for dependency tracking
@@ -114,6 +147,7 @@ bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
return true;
}
// ================================================================================================
void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
bool flushL1Cache = false;
@@ -170,6 +204,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
numMemObjectsInQueue_++;
}
// ================================================================================================
void VirtualGPU::MemoryDependency::clear(bool all) {
if (numMemObjectsInQueue_ > 0) {
size_t i, j;
@@ -205,6 +240,143 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
}
}
// ================================================================================================
//! Destroys every HSA signal owned by the tracker and frees the tracking objects.
//!
//! Pool entries can be null: Create() sizes the pool before filling it and bails out on the
//! first failed allocation, leaving the remaining slots empty. Those slots are skipped.
VirtualGPU::HwQueueTracker::~HwQueueTracker() {
  for (ProfilingSignal* signal : signal_list_) {
    if (signal == nullptr) {
      continue;
    }
    // Only signals that were actually created have a nonzero handle
    if (signal->signal_.handle != 0) {
      hsa_signal_destroy(signal->signal_);
    }
    delete signal;
  }
}
// ================================================================================================
//! Creates the initial pool of signals used for tracking of HW operations on the queue.
//!
//! When the system_scope_signal_ setting is enabled the signals are created without a
//! consumer list, otherwise they are restricted to the GPU agent of the owning VirtualGPU.
//!
//! Signals are appended one by one, so a failure in the middle never leaves null entries in
//! the pool; the destructor can release whatever was created so far.
//!
//! @return true if all signals were created, false on the first failure
bool VirtualGPU::HwQueueTracker::Create() {
  constexpr size_t kSignalListSize = 16;
  signal_list_.reserve(kSignalListSize);

  hsa_agent_t agent = gpu_.gpu_device();
  const Settings& settings = gpu_.dev().settings();
  hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
  uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;

  for (size_t i = 0; i < kSignalListSize; ++i) {
    // unique_ptr releases the tracking object if the HSA signal can't be created
    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
    if ((signal == nullptr) ||
        (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
      return false;
    }
    signal_list_.push_back(signal.release());
  }
  return true;
}
// ================================================================================================
//! Picks the next signal from the pool, re-arms it and returns it for the upcoming operation.
//!
//! @param init_val   Value the signal is reset to
//! @param ts         Optional timestamp object; when provided, SDMA profiling is enabled
//!                   (once) and the signal is registered with the timestamp
//! @param queue_size Number of pending operations; when it exceeds the pool size, one more
//!                   signal is added to the pool
//! @return The HSA signal associated with the new current slot
hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
    hsa_signal_value_t init_val, Timestamp* ts, uint32_t queue_size) {
  // A deeper queue needs more signals, otherwise the pool wraps around and stalls more often
  if (queue_size > signal_list_.size()) {
    std::unique_ptr<ProfilingSignal> extra(new ProfilingSignal());
    if (extra != nullptr) {
      hsa_agent_t device = gpu_.gpu_device();
      const Settings& settings = gpu_.dev().settings();
      const bool system_scope = settings.system_scope_signal_;
      hsa_agent_t* consumers = system_scope ? nullptr : &device;
      const uint32_t consumer_count = system_scope ? 0 : 1;
      if (hsa_signal_create(0, consumer_count, consumers, &extra->signal_) ==
          HSA_STATUS_SUCCESS) {
        signal_list_.push_back(extra.release());
      }
    }
  }

  // Advance to the next slot, wrapping around at the end of the pool
  current_id_ = (current_id_ + 1) % signal_list_.size();

  // The previous operation that used this slot must be finished before the slot is reused
  WaitCurrent();

  // The following slot must be idle as well: a GPU waiter on it may not have been triggered
  // yet and would race with the CPU reset of the signal below
  WaitNext();

  ProfilingSignal* const slot = signal_list_[current_id_];

  // Re-arm the signal for the new operation
  hsa_signal_silent_store_relaxed(slot->signal_, init_val);
  slot->done_ = false;
  slot->engine_ = engine_;

  if (ts != nullptr) {
    if (!sdma_profiling_) {
      hsa_amd_profiling_async_copy_enable(true);
      sdma_profiling_ = true;
    }
    slot->ts_ = ts;
    ts->AddProfilingSignal(slot);
  }
  return slot->signal_;
}
// ================================================================================================
//! Returns the signal the upcoming operation on \p engine has to wait for, if any.
//!
//! A wait is considered when the active engine changes, when the engine is unknown, or when
//! a non-compute engine is reused and the skip_copy_sync_ setting is off. A pending external
//! signal takes priority over the current signal of this tracker and is consumed by the call.
//! If the chosen signal is still busy, it is either waited on the CPU (cpu_wait_for_signal_
//! setting) or returned to the caller as a dependency.
//!
//! @param engine Engine that will execute the upcoming operation
//! @return Pointer to the dependent signal, or nullptr if no wait has to be submitted
hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
  const Settings& settings = gpu_.dev().settings();

  bool need_wait = false;
  if (engine != engine_) {
    // Runtime switches the active engine, hence the previous operation must be waited for
    engine_ = engine;
    need_wait = true;
  } else if (engine == HwQueueEngine::Unknown) {
    // Unknown engine in use, hence always wait
    need_wait = true;
  } else {
    // Skip wait optimization: the same compute engine relies on in-order execution. The copy
    // engines are trusted to be the same in ROCr only if the setting allows the prediction
    need_wait = !settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute);
  }

  if (!need_wait) {
    return nullptr;
  }

  // An external signal (submission in another queue) has priority and is consumed here
  ProfilingSignal* pending = external_signal_;
  if (pending != nullptr) {
    external_signal_ = nullptr;
  } else {
    pending = signal_list_[current_id_];
  }

  // Early signal status check: nothing to wait for if the operation is already done
  if (hsa_signal_load_relaxed(pending->signal_) <= 0) {
    return nullptr;
  }

  if (settings.cpu_wait_for_signal_) {
    // Wait on CPU if requested, then there is no dependency left for the caller
    CpuWaitForSignal(pending);
    return nullptr;
  }
  return &pending->signal_;
}
// ================================================================================================
//! Blocks the CPU until the operation tracked by \p signal is complete.
//!
//! If the signal is attached to a timestamp, the wait is delegated to
//! Timestamp::checkGpuTime(), which also collects the profiling info. Otherwise the signal
//! is waited on directly and marked as done.
//!
//! @param signal Tracking object to wait for; must not be null
//! @return false only if the direct wait on the HSA signal fails
bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
  // Nothing to do if the operation was already observed as complete
  if (signal->done_) {
    return true;
  }

  // Update timestamp values if requested
  if (signal->ts_ != nullptr) {
    signal->ts_->checkGpuTime();
    return true;
  }

  if (!WaitForSignal(signal->signal_)) {
    // Log the 64-bit handle value; the hsa_signal_t struct itself can't be passed to printf
    LogPrintfError("Failed signal [0x%llx] wait",
                   static_cast<unsigned long long>(signal->signal_.handle));
    return false;
  }
  signal->done_ = true;
  return true;
}
// ================================================================================================
void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
// Reset the signal and return
@@ -537,6 +709,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// ================================================================================================
//! Dispatches a kernel AQL packet, inserting a barrier packet first if the queue tracker
//! reports a signal that the dispatch depends on.
bool VirtualGPU::dispatchAqlPacket(
    hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
  hsa_signal_t* const dependency = Barriers().WaitingSignal();

  if (dependency == nullptr) {
    // No dependency: make sure the cached barrier packet doesn't carry a stale signal
    barrier_packet_.dep_signal[0] = hsa_signal_t{};
  } else {
    // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated
    barrier_packet_.dep_signal[0] = *dependency;
    constexpr bool kSkipSignal = true;
    dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
  }

  return dispatchGenericAqlPacket(packet, header, rest, blocking);
}
@@ -587,6 +769,9 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet,
if (!skipSignal) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
hsa_signal_t* wait = Barriers().WaitingSignal();
packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{};
// Get active signal for current dispatch if profiling is necessary
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
pool_size);
@@ -663,6 +848,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
schedulerParam_(nullptr),
schedulerQueue_(nullptr),
schedulerSignal_({0}),
barriers_(*this),
cuMask_(cuMask),
priority_(priority),
copy_command_type_(0)
@@ -804,7 +990,7 @@ bool VirtualGPU::create() {
}
// Allocate signal tracker for ROCr copy queue
if (!Barriers().Create(gpu_device())) {
if (!Barriers().Create()) {
LogError("Could not create signal for copy queue!");
return false;
}
@@ -867,7 +1053,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
return;
}
// Without barrier profiling will wait for each individual signal
timestamp_ = new Timestamp();
timestamp_ = new Timestamp(dev().getBackendDevice());
timestamp_->start();
}
}
@@ -1193,10 +1379,9 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
#if AMD_HMM_SUPPORT
profilingBegin(cmd);
// Initialize signal for the barrier
hsa_signal_t wait = Barriers().WaitSignal();
hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown);
hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
// Find the requested agent for the transfer
hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1207,7 +1392,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
hsa_status_t status = hsa_amd_svm_prefetch_async(
const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active);
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
Barriers().ResetCurrentSignal();
LogError("hsa_amd_svm_prefetch_async failed");
@@ -2376,6 +2561,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
return true;
}
/**
* @brief Api to dispatch a kernel for execution. The implementation
* parses the input object, an instance of virtual command to obtain
@@ -2385,10 +2571,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
* It also parses the kernel arguments buffer to inject into Hsa Runtime
* the list of kernel parameters.
*/
// ================================================================================================
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
// Wait for the execution on the current queue, since the coop groups will use the device queue
releaseGpuMemoryFence();
releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
// Get device queue for exclusive GPU access
VirtualGPU* queue = dev().xferQueue();
@@ -2398,6 +2585,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
queue->profilingBegin(vcmd);
// Add a dependency into the device queue on the current queue
queue->Barriers().SetExternalSignal(Barriers().GetLastSignal());
if (vcmd.cooperativeGroups()) {
// Initialize GWS if it's cooperative groups launch
uint32_t workgroups = 0;
@@ -2420,7 +2610,11 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
// Wait for the execution on the device queue. Keep the current queue in-order
queue->releaseGpuMemoryFence();
queue->releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
// Add a dependency into the current queue on the coop queue
Barriers().SetExternalSignal(queue->Barriers().GetLastSignal());
hasPendingDispatch_ = true;
queue->profilingEnd(vcmd);
} else {
@@ -2440,6 +2634,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
}
}
// ================================================================================================
//! Submission entry point for native function commands.
//! Not implemented: the body is empty, so \p cmd is ignored and nothing is executed.
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
  // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
}
+33 -121
Ver ficheiro
@@ -92,46 +92,17 @@ class Timestamp {
void AddProfilingSignal(ProfilingSignal* signal) { signals_.push_back(signal); }
const bool HwProfiling() const { return (signals_.size() > 0) ? true : false; }
const bool HwProfiling() const { return !signals_.empty(); }
void setAgent(hsa_agent_t agent) { agent_ = agent; }
Timestamp()
Timestamp(hsa_agent_t agent)
: start_(std::numeric_limits<uint64_t>::max())
, end_(0) {
agent_.handle = 0;
}
, end_(0)
, agent_(agent) {}
~Timestamp() {}
//! Finds execution ticks on GPU
void checkGpuTime() {
if (HwProfiling()) {
hsa_amd_profiling_dispatch_time_t time = {};
uint64_t start = std::numeric_limits<uint64_t>::max();
uint64_t end = 0;
for (auto it : signals_) {
if (hsa_signal_load_relaxed(it->signal_) > 0) {
WaitForSignal(it->signal_);
}
hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
if ((time.end - time.start) == 0) {
hsa_amd_profiling_async_copy_time_t time_sdma = {};
hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
time.start = time_sdma.start;
time.end = time_sdma.end;
}
start = std::min(time.start, start);
end = std::max(time.end, end);
it->ts_ = nullptr;
it->done_ = true;
}
signals_.clear();
start_ = start * ticksToTime_;
end_ = end * ticksToTime_;
}
}
void checkGpuTime();
// Start a timestamp (get timestamp from OS)
void start() { start_ = amd::Os::timeNanos(); }
@@ -183,113 +154,54 @@ class VirtualGPU : public device::VirtualDevice {
class HwQueueTracker : public amd::EmbeddedObject {
public:
HwQueueTracker() {}
HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}
~HwQueueTracker() {
for (auto& signal: signal_list_) {
if (signal->signal_.handle != 0) {
hsa_signal_destroy(signal->signal_);
}
delete signal;
}
}
~HwQueueTracker();
//! Creates a pool of signals for tracking of HW operations on the queue
bool Create(hsa_agent_t agent) {
constexpr size_t kSignalListSize = 16;
signal_list_.resize(kSignalListSize);
for (uint i = 0; i < kSignalListSize; ++i) {
ProfilingSignal* signal = new ProfilingSignal();
if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create(
0, 1, &agent, &signal->signal_))) {
return false;
}
signal_list_[i] = signal;
}
agent_ = agent;
return true;
}
bool Create();
//! Finds a free signal for the upcoming operation
hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
Timestamp* ts = nullptr, uint32_t queue_size = 0) {
// If queue size grows, then add more signals to avoid more frequent stalls
if (queue_size > signal_list_.size()) {
ProfilingSignal* signal = new ProfilingSignal();
if (signal != nullptr) {
if (HSA_STATUS_SUCCESS == hsa_signal_create(
0, 1, &agent_, &signal->signal_)) {
signal_list_.push_back(signal);
}
}
}
// Find valid index
++current_id_ %= signal_list_.size();
// Make sure the previous operation on the current signal is done
WaitCurrent();
// Have to wait the next signal in the queue to avoid a race condition between
// a GPU waiter(which may be not triggered yet) and CPU signal reset below
WaitNext();
// Reset the signal and return
hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
signal_list_[current_id_]->done_ = false;
if (ts != 0) {
if (!sdma_profiling_) {
hsa_amd_profiling_async_copy_enable(true);
sdma_profiling_ = true;
}
signal_list_[current_id_]->ts_ = ts;
ts->AddProfilingSignal(signal_list_[current_id_]);
ts->setAgent(agent_);
}
return signal_list_[current_id_]->signal_;
}
Timestamp* ts = nullptr, uint32_t queue_size = 0);
//! Wait for the current active signal. Can idle the queue
bool WaitCurrent() { return WaitIndex(current_id_); }
bool WaitCurrent() { return CpuWaitForSignal(signal_list_[current_id_]); }
//! Update current active engine
void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
//! Returns the last submitted signal for a wait
hsa_signal_t WaitSignal() {
//! @note Currently wait on CPU unconditionally to avoid a negative performance impact
WaitCurrent();
return hsa_signal_t{};
}
hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
//! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
void ResetCurrentSignal();
private:
//! Inserts an external signal(submission in another queue) for dependency tracking
void SetExternalSignal(ProfilingSignal* signal) {
external_signal_ = signal;
engine_ = HwQueueEngine::External;
}
//! Returns the tracking object of the last submitted signal on this queue
ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; }
private:
//! Wait for the next active signal
void WaitNext() {
size_t next = (current_id_ + 1) % signal_list_.size();
WaitIndex(next);
CpuWaitForSignal(signal_list_[next]);
}
//! Wait for the provided signal
bool WaitIndex(size_t index) {
// Wait for the current signal
if (!signal_list_[index]->done_) {
// Update timestamp values if requested
if (signal_list_[index]->ts_ != nullptr) {
signal_list_[index]->ts_->checkGpuTime();
} else {
if (!WaitForSignal(signal_list_[index]->signal_)) {
LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_);
return false;
}
signal_list_[index]->done_ = true;
}
}
return true;
}
bool CpuWaitForSignal(ProfilingSignal* signal);
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
size_t current_id_ = 0; //!< Last submitted signal
hsa_agent_t agent_; //!< HSA device agent
bool sdma_profiling_ = false; //!< Don't enable SDMA profiling by default
HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
ProfilingSignal* external_signal_ = nullptr; //!< Dependency on external signal
size_t current_id_ = 0; //!< Last submitted signal
bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled
const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker
};
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -358,7 +270,7 @@ class VirtualGPU : public device::VirtualDevice {
*/
bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false);
hsa_agent_t gpu_device() { return gpu_device_; }
hsa_agent_t gpu_device() const { return gpu_device_; }
hsa_queue_t* gpu_queue() { return gpu_queue_; }
// Return pointer to PrintfDbg
+6
Ver ficheiro
@@ -244,6 +244,12 @@ release(bool, ROC_ACTIVE_WAIT, false, \
"Forces unconditional active wait for GPU") \
release(bool, ROC_ENABLE_LARGE_BAR, true, \
"Enable Large Bar if supported by the device") \
release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \
"Enable CPU wait for dependent HSA signals.") \
release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \
"Enable system scope for signals (uses interrupts).") \
release(bool, ROC_SKIP_COPY_SYNC, false, \
"Skips copy syncs if runtime can predict the same engine.") \
release(bool, HIP_FORCE_QUEUE_PROFILING, false, \
"Force command queue profiling by default") \
release(uint, PAL_FORCE_ASIC_REVISION, 0, \