SWDEV-257787 - Add engine tracking per signal
- The logic will trace compute, sdma read/write operations and
apply signals when necessary
- ROC_CPU_WAIT_FOR_SIGNAL, ROC_SYSTEM_SCOPE_SIGNAL
and ROC_SKIP_COPY_SYNC were added to control the tracking
Change-Id: I9e8e6174c63bf7784f7ab00964e2918c8667d364
[ROCm/clr commit: dbc7abaecf]
Este cometimento está contido em:
cometido por
Saleel Kudchadker
ascendente
768a4383cd
cometimento
f96e973378
@@ -433,18 +433,27 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
hsa_dim3_t offset = { 0, 0 ,0 };
|
||||
|
||||
|
||||
if ((srcRect.rowPitch_ % 4 != 0) ||
|
||||
(srcRect.slicePitch_ % 4 != 0) ||
|
||||
(dstRect.rowPitch_ % 4 != 0) ||
|
||||
if ((srcRect.rowPitch_ % 4 != 0) ||
|
||||
(srcRect.slicePitch_ % 4 != 0) ||
|
||||
(dstRect.rowPitch_ % 4 != 0) ||
|
||||
(dstRect.slicePitch_ % 4 != 0)) {
|
||||
isSubwindowRectCopy = false;
|
||||
}
|
||||
|
||||
HwQueueEngine engine = HwQueueEngine::Unknown;
|
||||
if ((srcAgent.handle == dev().getCpuAgent().handle) &&
|
||||
(dstAgent.handle != dev().getCpuAgent().handle)) {
|
||||
engine = HwQueueEngine::SdmaWrite;
|
||||
} else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
|
||||
(dstAgent.handle == dev().getCpuAgent().handle)) {
|
||||
engine = HwQueueEngine::SdmaRead;
|
||||
}
|
||||
|
||||
hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
|
||||
uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
|
||||
|
||||
if (isSubwindowRectCopy ) {
|
||||
hsa_signal_t wait = gpu().Barriers().WaitSignal();
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
|
||||
hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
|
||||
|
||||
// Copy memory line by line
|
||||
hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
|
||||
@@ -457,10 +466,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
} else {
|
||||
// Fall to line by line copies
|
||||
const hsa_signal_value_t kInitVal = size[2] * size[1];
|
||||
hsa_signal_t wait = gpu().Barriers().WaitSignal();
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
|
||||
uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
|
||||
hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
|
||||
|
||||
for (size_t z = 0; z < size[2]; ++z) {
|
||||
for (size_t y = 0; y < size[1]; ++y) {
|
||||
@@ -472,18 +478,18 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
(reinterpret_cast<address>(dst) + dstOffset), dstAgent,
|
||||
(reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
|
||||
size[0], num_wait_events, wait_event, active);
|
||||
gpu().setLastCommandSDMA(true) ;
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("DMA buffer failed with code %d", status);
|
||||
return false;
|
||||
} else {
|
||||
gpu().setLastCommandSDMA(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Explicit wait for now, until runtime could distinguish compute and sdma operations
|
||||
gpu().Barriers().WaitCurrent();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -644,18 +650,24 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
srcAgent = dstAgent = dev().getBackendDevice();
|
||||
}
|
||||
|
||||
hsa_signal_t wait = gpu().Barriers().WaitSignal();
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
|
||||
hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
|
||||
HwQueueEngine engine = HwQueueEngine::Unknown;
|
||||
if ((srcAgent.handle == dev().getCpuAgent().handle) &&
|
||||
(dstAgent.handle != dev().getCpuAgent().handle)) {
|
||||
engine = HwQueueEngine::SdmaWrite;
|
||||
} else if ((srcAgent.handle != dev().getCpuAgent().handle) &&
|
||||
(dstAgent.handle == dev().getCpuAgent().handle)) {
|
||||
engine = HwQueueEngine::SdmaRead;
|
||||
}
|
||||
|
||||
hsa_signal_t* wait_event = gpu().Barriers().WaitingSignal(engine);
|
||||
uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
// Use SDMA to transfer the data
|
||||
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
|
||||
size[0], num_wait_events, wait_event, active);
|
||||
gpu().setLastCommandSDMA(true);
|
||||
// Explicit wait for now, until runtime could distinguish compute and sdma operations
|
||||
gpu().Barriers().WaitCurrent();
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
gpu().setLastCommandSDMA(true);
|
||||
gpu().addSystemScope();
|
||||
} else {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
@@ -690,7 +702,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
// Allocate requested size of memory
|
||||
while (totalSize > 0) {
|
||||
size = std::min(totalSize, dev().settings().stagedXferSize_);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
// Copy data from Host to Device
|
||||
if (hostToDev) {
|
||||
@@ -700,14 +711,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
const hsa_agent_t srcAgent =
|
||||
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
|
||||
|
||||
HwQueueEngine engine = HwQueueEngine::Unknown;
|
||||
if (srcAgent.handle == dev().getBackendDevice().handle) {
|
||||
engine = HwQueueEngine::SdmaWrite;
|
||||
}
|
||||
gpu().Barriers().SetActiveEngine(engine);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
memcpy(hsaBuffer, hostSrc + offset, size);
|
||||
status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
|
||||
srcAgent, size, 0, nullptr, active);
|
||||
gpu().setLastCommandSDMA(true);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("Hsa copy from host to device failed with code %d", status);
|
||||
return false;
|
||||
} else {
|
||||
gpu().setLastCommandSDMA(true);
|
||||
}
|
||||
gpu().Barriers().WaitCurrent();
|
||||
totalSize -= size;
|
||||
@@ -721,14 +740,22 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
const hsa_agent_t dstAgent =
|
||||
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
|
||||
|
||||
HwQueueEngine engine = HwQueueEngine::Unknown;
|
||||
if (dstAgent.handle == dev().getBackendDevice().handle) {
|
||||
engine = HwQueueEngine::SdmaRead;
|
||||
}
|
||||
gpu().Barriers().SetActiveEngine(engine);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
// Copy data from Device to Host
|
||||
status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
|
||||
dev().getBackendDevice(), size, 0, nullptr, active);
|
||||
gpu().setLastCommandSDMA(true);
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
gpu().setLastCommandSDMA(true);
|
||||
gpu().Barriers().WaitCurrent();
|
||||
memcpy(hostDst + offset, hsaBuffer, size);
|
||||
} else {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("Hsa copy from device to host failed with code %d", status);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -40,9 +40,6 @@ class Kernel;
|
||||
class Memory;
|
||||
class VirtualGPU;
|
||||
|
||||
constexpr bool kSkipCpuWait = true;
|
||||
constexpr bool kIgnoreBarrier = false;
|
||||
|
||||
//! DMA Blit Manager
|
||||
class DmaBlitManager : public device::HostBlitManager {
|
||||
public:
|
||||
|
||||
@@ -33,11 +33,15 @@ static constexpr uint DeviceQueueMaskSize = 32;
|
||||
//! Set to match the number of pipes, which is 8.
|
||||
static constexpr uint kMaxAsyncQueues = 8;
|
||||
|
||||
constexpr bool kSkipCpuWait = true;
|
||||
constexpr bool kIgnoreBarrier = false;
|
||||
|
||||
enum HwQueueEngine : uint32_t {
|
||||
Compute = 0,
|
||||
SdmaRead = 1,
|
||||
SdmaWrite = 2,
|
||||
Unknown = 3
|
||||
Unknown = 3,
|
||||
External = 4
|
||||
};
|
||||
|
||||
} // namespace roc
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
|
||||
namespace roc {
|
||||
|
||||
// ================================================================================================
|
||||
Settings::Settings() {
|
||||
// Initialize the HSA device default settings
|
||||
|
||||
@@ -91,8 +92,15 @@ Settings::Settings() {
|
||||
|
||||
rocr_backend_ = true;
|
||||
barrier_sync_ = (!flagIsDefault(ROC_BARRIER_SYNC)) ? ROC_BARRIER_SYNC : true;
|
||||
|
||||
cpu_wait_for_signal_ = !AMD_DIRECT_DISPATCH;
|
||||
cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ?
|
||||
ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_;
|
||||
system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL;
|
||||
skip_copy_sync_ = ROC_SKIP_COPY_SYNC;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, bool enableXNACK,
|
||||
bool coop_groups) {
|
||||
customHostAllocator_ = false;
|
||||
@@ -169,6 +177,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Settings::override() {
|
||||
// Limit reported workgroup size
|
||||
if (GPU_MAX_WORKGROUP_SIZE != 0) {
|
||||
|
||||
@@ -52,7 +52,10 @@ class Settings : public device::Settings {
|
||||
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
|
||||
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
|
||||
uint barrier_sync_ : 1; //!< Use AQL barrier command to sync with CPU
|
||||
uint reserved_ : 23;
|
||||
uint cpu_wait_for_signal_ : 1; //!< Wait for HSA signal on CPU
|
||||
uint system_scope_signal_ : 1; //!< HSA signal is visibile to the entire system
|
||||
uint skip_copy_sync_ : 1; //!< Ignore explicit HSA signal waits for copy functionality
|
||||
uint reserved_ : 20;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -35,10 +35,12 @@
|
||||
#include "amd_hsa_kernel_code.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
|
||||
/**
|
||||
* HSA image object size in bytes (see HSAIL spec)
|
||||
@@ -100,6 +102,37 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
|
||||
return (v >> pos) & ((1 << width) - 1);
|
||||
};
|
||||
|
||||
// ================================================================================================
|
||||
void Timestamp::checkGpuTime() {
|
||||
if (HwProfiling()) {
|
||||
uint64_t start = std::numeric_limits<uint64_t>::max();
|
||||
uint64_t end = 0;
|
||||
|
||||
for (auto it : signals_) {
|
||||
if (hsa_signal_load_relaxed(it->signal_) > 0) {
|
||||
WaitForSignal(it->signal_);
|
||||
}
|
||||
hsa_amd_profiling_dispatch_time_t time = {};
|
||||
if (it->engine_ == HwQueueEngine::Compute) {
|
||||
hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
|
||||
} else {
|
||||
hsa_amd_profiling_async_copy_time_t time_sdma = {};
|
||||
hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
|
||||
time.start = time_sdma.start;
|
||||
time.end = time_sdma.end;
|
||||
}
|
||||
start = std::min(time.start, start);
|
||||
end = std::max(time.end, end);
|
||||
it->ts_ = nullptr;
|
||||
it->done_ = true;
|
||||
}
|
||||
signals_.clear();
|
||||
start_ = start * ticksToTime_;
|
||||
end_ = end * ticksToTime_;
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
|
||||
if (numMemObj > 0) {
|
||||
// Allocate the array of memory objects for dependency tracking
|
||||
@@ -114,6 +147,7 @@ bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
|
||||
bool flushL1Cache = false;
|
||||
|
||||
@@ -170,6 +204,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
|
||||
numMemObjectsInQueue_++;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::MemoryDependency::clear(bool all) {
|
||||
if (numMemObjectsInQueue_ > 0) {
|
||||
size_t i, j;
|
||||
@@ -205,6 +240,143 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Destroys every HSA signal owned by the tracker and frees the tracking objects.
//!
//! The pool may contain null entries: Create() sizes the list up front and can bail out
//! before every slot is populated, so each entry is checked before it is dereferenced.
VirtualGPU::HwQueueTracker::~HwQueueTracker() {
  for (ProfilingSignal* signal : signal_list_) {
    if (signal == nullptr) {
      // Slot was never populated (partial Create() failure)
      continue;
    }
    // Only destroy HSA signals that were actually created
    if (signal->signal_.handle != 0) {
      hsa_signal_destroy(signal->signal_);
    }
    delete signal;
  }
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::HwQueueTracker::Create() {
|
||||
constexpr size_t kSignalListSize = 16;
|
||||
signal_list_.resize(kSignalListSize);
|
||||
|
||||
hsa_agent_t agent = gpu_.gpu_device();
|
||||
const Settings& settings = gpu_.dev().settings();
|
||||
hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
|
||||
uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
|
||||
|
||||
for (uint i = 0; i < kSignalListSize; ++i) {
|
||||
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
|
||||
if ((signal == nullptr) ||
|
||||
(HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
|
||||
return false;
|
||||
}
|
||||
signal_list_[i] = signal.release();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Finds a free signal for the upcoming operation.
//!
//! @param init_val   Value stored into the selected signal before it is handed out
//! @param ts         Optional timestamp object; when provided, SDMA profiling is enabled
//!                   (once) and the selected signal is attached to the timestamp
//! @param queue_size Number of pending operations; if it exceeds the pool size, the pool
//!                   is extended by one signal
//! @return The HSA signal of the newly selected pool entry
hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
    hsa_signal_value_t init_val, Timestamp* ts, uint32_t queue_size) {
  // A growing queue gets an extra signal, which makes stalls on signal reuse less frequent
  if (queue_size > signal_list_.size()) {
    std::unique_ptr<ProfilingSignal> extra(new ProfilingSignal());
    if (extra != nullptr) {
      hsa_agent_t agent = gpu_.gpu_device();
      const bool system_scope = gpu_.dev().settings().system_scope_signal_;
      hsa_agent_t* agents = system_scope ? nullptr : &agent;
      const uint32_t num_agents = system_scope ? 0 : 1;

      // A failed creation is not fatal: the pool simply keeps its current size
      if (hsa_signal_create(0, num_agents, agents, &extra->signal_) == HSA_STATUS_SUCCESS) {
        signal_list_.push_back(extra.release());
      }
    }
  }

  // Advance to the next slot, wrapping around at the end of the pool
  current_id_ = (current_id_ + 1) % signal_list_.size();

  // The previous operation that used this slot has to be finished before reuse
  WaitCurrent();

  // The following slot is drained as well. Otherwise a GPU waiter (which may be not
  // triggered yet) could race with the CPU signal reset below
  WaitNext();

  // Re-arm the slot for the new operation
  ProfilingSignal* current = signal_list_[current_id_];
  hsa_signal_silent_store_relaxed(current->signal_, init_val);
  current->done_ = false;
  current->engine_ = engine_;

  if (ts != nullptr) {
    // Turn SDMA profiling on the first time a timestamp is requested
    if (!sdma_profiling_) {
      hsa_amd_profiling_async_copy_enable(true);
      sdma_profiling_ = true;
    }
    current->ts_ = ts;
    ts->AddProfilingSignal(current);
  }
  return current->signal_;
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Returns the signal the upcoming operation on the given engine has to wait for.
//!
//! Records @p engine as the active engine when it differs from the tracked one.
//!
//! @param engine Engine that will execute the upcoming operation
//! @return Pointer to the dependency signal, or nullptr when no GPU-side wait is needed
//!         (no dependency is required, the dependency has already completed, or it was
//!         waited for on the CPU)
hsa_signal_t* VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
  bool need_wait;
  if (engine != engine_) {
    // The runtime switches the active engine, so the previous operation must be waited for
    engine_ = engine;
    need_wait = true;
  } else if (engine == HwQueueEngine::Unknown) {
    // Unknown engine in use, hence a wait is always requested
    need_wait = true;
  } else {
    // Same engine as before. Unless the skip-copy-sync optimization is enabled, non-compute
    // engines still get an explicit wait. With the optimization the wait is dropped,
    // relying on in-order engine execution
    need_wait = !gpu_.dev().settings().skip_copy_sync_ && (engine != HwQueueEngine::Compute);
  }

  if (!need_wait) {
    return nullptr;
  }

  // A pending external signal takes priority over the last signal of this queue and is
  // consumed by this call
  ProfilingSignal* dependency;
  if (external_signal_ != nullptr) {
    dependency = external_signal_;
    external_signal_ = nullptr;
  } else {
    dependency = signal_list_[current_id_];
  }

  // Early status check: a signal that is no longer positive needs no wait
  if (hsa_signal_load_relaxed(dependency->signal_) <= 0) {
    return nullptr;
  }

  // Wait on CPU if requested; the caller then has nothing left to wait for
  if (gpu_.dev().settings().cpu_wait_for_signal_) {
    CpuWaitForSignal(dependency);
    return nullptr;
  }
  return &dependency->signal_;
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Blocks the calling CPU thread until the operation tracked by the signal is done.
//!
//! If a timestamp is attached to the signal, the wait is delegated to
//! Timestamp::checkGpuTime(), which also collects the profiling information.
//! Otherwise the signal is waited for directly and then marked as done.
//!
//! @param signal Tracking object to wait for
//! @return false if the direct wait on the HSA signal failed, true otherwise
bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
  // Nothing to do if the operation was already observed as complete
  if (signal->done_) {
    return true;
  }

  if (signal->ts_ != nullptr) {
    // Update timestamp values if requested
    signal->ts_->checkGpuTime();
  } else {
    if (!WaitForSignal(signal->signal_)) {
      // Log the 64-bit handle value; the hsa_signal_t struct itself doesn't match "%lx"
      LogPrintfError("Failed signal [0x%lx] wait", signal->signal_.handle);
      return false;
    }
    signal->done_ = true;
  }
  return true;
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
|
||||
// Reset the signal and return
|
||||
@@ -537,6 +709,16 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
// ================================================================================================
|
||||
//! Dispatches a kernel AQL packet, inserting a barrier packet first when the dispatch
//! depends on a signal from a previous operation.
//!
//! @return The result of dispatchGenericAqlPacket() for the kernel packet
bool VirtualGPU::dispatchAqlPacket(
    hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
  hsa_signal_t* dependency = Barriers().WaitingSignal();

  if (dependency == nullptr) {
    // No dependency: clear the dependent signal slot of the barrier packet
    barrier_packet_.dep_signal[0] = hsa_signal_t{};
  } else {
    // AQL dispatch doesn't support dependent signals, hence an extra barrier packet
    // carrying the dependency is submitted ahead of the kernel packet
    constexpr bool kSkipSignal = true;
    barrier_packet_.dep_signal[0] = *dependency;
    dispatchBarrierPacket(&barrier_packet_, kNopPacketHeader, kSkipSignal);
  }

  return dispatchGenericAqlPacket(packet, header, rest, blocking);
}
|
||||
|
||||
@@ -587,6 +769,9 @@ void VirtualGPU::dispatchBarrierPacket(hsa_barrier_and_packet_t* packet,
|
||||
if (!skipSignal) {
|
||||
// Pool size must grow to the size of pending AQL packets
|
||||
const uint32_t pool_size = index - read;
|
||||
hsa_signal_t* wait = Barriers().WaitingSignal();
|
||||
packet->dep_signal[0] = (wait != nullptr) ? *wait : hsa_signal_t{};
|
||||
|
||||
// Get active signal for current dispatch if profiling is necessary
|
||||
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_,
|
||||
pool_size);
|
||||
@@ -663,6 +848,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
schedulerParam_(nullptr),
|
||||
schedulerQueue_(nullptr),
|
||||
schedulerSignal_({0}),
|
||||
barriers_(*this),
|
||||
cuMask_(cuMask),
|
||||
priority_(priority),
|
||||
copy_command_type_(0)
|
||||
@@ -804,7 +990,7 @@ bool VirtualGPU::create() {
|
||||
}
|
||||
|
||||
// Allocate signal tracker for ROCr copy queue
|
||||
if (!Barriers().Create(gpu_device())) {
|
||||
if (!Barriers().Create()) {
|
||||
LogError("Could not create signal for copy queue!");
|
||||
return false;
|
||||
}
|
||||
@@ -867,7 +1053,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
|
||||
return;
|
||||
}
|
||||
// Without barrier profiling will wait for each individual signal
|
||||
timestamp_ = new Timestamp();
|
||||
timestamp_ = new Timestamp(dev().getBackendDevice());
|
||||
timestamp_->start();
|
||||
}
|
||||
}
|
||||
@@ -1193,10 +1379,9 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
|
||||
#if AMD_HMM_SUPPORT
|
||||
profilingBegin(cmd);
|
||||
// Initialize signal for the barrier
|
||||
hsa_signal_t wait = Barriers().WaitSignal();
|
||||
hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
|
||||
uint32_t num_wait_events = (wait.handle == 0) ? 0 : 1;
|
||||
hsa_signal_t* wait_event = (wait.handle == 0) ? nullptr : &wait;
|
||||
hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown);
|
||||
hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
|
||||
uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1;
|
||||
|
||||
// Find the requested agent for the transfer
|
||||
hsa_agent_t agent = (cmd.cpu_access() ||
|
||||
@@ -1207,7 +1392,7 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
|
||||
hsa_status_t status = hsa_amd_svm_prefetch_async(
|
||||
const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active);
|
||||
|
||||
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
|
||||
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution
|
||||
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
|
||||
Barriers().ResetCurrentSignal();
|
||||
LogError("hsa_amd_svm_prefetch_async failed");
|
||||
@@ -2376,6 +2561,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Api to dispatch a kernel for execution. The implementation
|
||||
* parses the input object, an instance of virtual command to obtain
|
||||
@@ -2385,10 +2571,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
* It also parses the kernel arguments buffer to inject into Hsa Runtime
|
||||
* the list of kernel parameters.
|
||||
*/
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
if (vcmd.cooperativeGroups() || vcmd.cooperativeMultiDeviceGroups()) {
|
||||
// Wait for the execution on the current queue, since the coop groups will use the device queue
|
||||
releaseGpuMemoryFence();
|
||||
releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
|
||||
|
||||
// Get device queue for exclusive GPU access
|
||||
VirtualGPU* queue = dev().xferQueue();
|
||||
@@ -2398,6 +2585,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
|
||||
queue->profilingBegin(vcmd);
|
||||
|
||||
// Add a dependency into the device queue on the current queue
|
||||
queue->Barriers().SetExternalSignal(Barriers().GetLastSignal());
|
||||
|
||||
if (vcmd.cooperativeGroups()) {
|
||||
// Initialize GWS if it's cooperative groups launch
|
||||
uint32_t workgroups = 0;
|
||||
@@ -2420,7 +2610,11 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
vcmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
// Wait for the execution on the device queue. Keep the current queue in-order
|
||||
queue->releaseGpuMemoryFence();
|
||||
queue->releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
|
||||
|
||||
// Add a dependency into the current queue on the coop queue
|
||||
Barriers().SetExternalSignal(queue->Barriers().GetLastSignal());
|
||||
hasPendingDispatch_ = true;
|
||||
|
||||
queue->profilingEnd(vcmd);
|
||||
} else {
|
||||
@@ -2440,6 +2634,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
|
||||
// std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
|
||||
}
|
||||
|
||||
@@ -92,46 +92,17 @@ class Timestamp {
|
||||
|
||||
void AddProfilingSignal(ProfilingSignal* signal) { signals_.push_back(signal); }
|
||||
|
||||
const bool HwProfiling() const { return (signals_.size() > 0) ? true : false; }
|
||||
const bool HwProfiling() const { return !signals_.empty(); }
|
||||
|
||||
void setAgent(hsa_agent_t agent) { agent_ = agent; }
|
||||
|
||||
Timestamp()
|
||||
Timestamp(hsa_agent_t agent)
|
||||
: start_(std::numeric_limits<uint64_t>::max())
|
||||
, end_(0) {
|
||||
agent_.handle = 0;
|
||||
}
|
||||
, end_(0)
|
||||
, agent_(agent) {}
|
||||
|
||||
~Timestamp() {}
|
||||
|
||||
//! Finds execution ticks on GPU
|
||||
void checkGpuTime() {
|
||||
if (HwProfiling()) {
|
||||
hsa_amd_profiling_dispatch_time_t time = {};
|
||||
|
||||
uint64_t start = std::numeric_limits<uint64_t>::max();
|
||||
uint64_t end = 0;
|
||||
for (auto it : signals_) {
|
||||
if (hsa_signal_load_relaxed(it->signal_) > 0) {
|
||||
WaitForSignal(it->signal_);
|
||||
}
|
||||
hsa_amd_profiling_get_dispatch_time(agent_, it->signal_, &time);
|
||||
if ((time.end - time.start) == 0) {
|
||||
hsa_amd_profiling_async_copy_time_t time_sdma = {};
|
||||
hsa_amd_profiling_get_async_copy_time(it->signal_, &time_sdma);
|
||||
time.start = time_sdma.start;
|
||||
time.end = time_sdma.end;
|
||||
}
|
||||
start = std::min(time.start, start);
|
||||
end = std::max(time.end, end);
|
||||
it->ts_ = nullptr;
|
||||
it->done_ = true;
|
||||
}
|
||||
signals_.clear();
|
||||
start_ = start * ticksToTime_;
|
||||
end_ = end * ticksToTime_;
|
||||
}
|
||||
}
|
||||
void checkGpuTime();
|
||||
|
||||
// Start a timestamp (get timestamp from OS)
|
||||
void start() { start_ = amd::Os::timeNanos(); }
|
||||
@@ -183,113 +154,54 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
class HwQueueTracker : public amd::EmbeddedObject {
|
||||
public:
|
||||
HwQueueTracker() {}
|
||||
HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}
|
||||
|
||||
~HwQueueTracker() {
|
||||
for (auto& signal: signal_list_) {
|
||||
if (signal->signal_.handle != 0) {
|
||||
hsa_signal_destroy(signal->signal_);
|
||||
}
|
||||
delete signal;
|
||||
}
|
||||
}
|
||||
~HwQueueTracker();
|
||||
|
||||
//! Creates a pool of signals for tracking of HW operations on the queue
|
||||
bool Create(hsa_agent_t agent) {
|
||||
constexpr size_t kSignalListSize = 16;
|
||||
signal_list_.resize(kSignalListSize);
|
||||
for (uint i = 0; i < kSignalListSize; ++i) {
|
||||
ProfilingSignal* signal = new ProfilingSignal();
|
||||
if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create(
|
||||
0, 1, &agent, &signal->signal_))) {
|
||||
return false;
|
||||
}
|
||||
signal_list_[i] = signal;
|
||||
}
|
||||
agent_ = agent;
|
||||
return true;
|
||||
}
|
||||
bool Create();
|
||||
|
||||
//! Finds a free signal for the upcoming operation
|
||||
hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
|
||||
Timestamp* ts = nullptr, uint32_t queue_size = 0) {
|
||||
// If queue size grows, then add more signals to avoid more frequent stalls
|
||||
if (queue_size > signal_list_.size()) {
|
||||
ProfilingSignal* signal = new ProfilingSignal();
|
||||
if (signal != nullptr) {
|
||||
if (HSA_STATUS_SUCCESS == hsa_signal_create(
|
||||
0, 1, &agent_, &signal->signal_)) {
|
||||
signal_list_.push_back(signal);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Find valid index
|
||||
++current_id_ %= signal_list_.size();
|
||||
|
||||
// Make sure the previous operation on the current signal is done
|
||||
WaitCurrent();
|
||||
|
||||
// Have to wait the next signal in the queue to avoid a race condition between
|
||||
// a GPU waiter(which may be not triggered yet) and CPU signal reset below
|
||||
WaitNext();
|
||||
|
||||
// Reset the signal and return
|
||||
hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
|
||||
signal_list_[current_id_]->done_ = false;
|
||||
if (ts != 0) {
|
||||
if (!sdma_profiling_) {
|
||||
hsa_amd_profiling_async_copy_enable(true);
|
||||
sdma_profiling_ = true;
|
||||
}
|
||||
signal_list_[current_id_]->ts_ = ts;
|
||||
ts->AddProfilingSignal(signal_list_[current_id_]);
|
||||
ts->setAgent(agent_);
|
||||
}
|
||||
return signal_list_[current_id_]->signal_;
|
||||
}
|
||||
Timestamp* ts = nullptr, uint32_t queue_size = 0);
|
||||
|
||||
//! Wait for the current active signal. Can idle the queue
|
||||
bool WaitCurrent() { return WaitIndex(current_id_); }
|
||||
bool WaitCurrent() { return CpuWaitForSignal(signal_list_[current_id_]); }
|
||||
|
||||
//! Update current active engine
|
||||
void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
|
||||
|
||||
//! Returns the last submitted signal for a wait
|
||||
hsa_signal_t WaitSignal() {
|
||||
//! @note Currently wait on CPU unconditionally to avoid a negative performance impact
|
||||
WaitCurrent();
|
||||
return hsa_signal_t{};
|
||||
}
|
||||
hsa_signal_t* WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
|
||||
|
||||
//! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
|
||||
void ResetCurrentSignal();
|
||||
|
||||
private:
|
||||
//! Inserts an external signal(submission in another queue) for dependency tracking
|
||||
void SetExternalSignal(ProfilingSignal* signal) {
|
||||
external_signal_ = signal;
|
||||
engine_ = HwQueueEngine::External;
|
||||
}
|
||||
|
||||
//! Returns the last submitted signal, so another queue can use it as an external dependency
|
||||
ProfilingSignal* GetLastSignal() const { return signal_list_[current_id_]; }
|
||||
|
||||
private:
|
||||
//! Wait for the next active signal
|
||||
void WaitNext() {
|
||||
size_t next = (current_id_ + 1) % signal_list_.size();
|
||||
WaitIndex(next);
|
||||
CpuWaitForSignal(signal_list_[next]);
|
||||
}
|
||||
|
||||
//! Wait for the provided signal
|
||||
bool WaitIndex(size_t index) {
|
||||
// Wait for the current signal
|
||||
if (!signal_list_[index]->done_) {
|
||||
// Update timestamp values if requested
|
||||
if (signal_list_[index]->ts_ != nullptr) {
|
||||
signal_list_[index]->ts_->checkGpuTime();
|
||||
} else {
|
||||
if (!WaitForSignal(signal_list_[index]->signal_)) {
|
||||
LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_);
|
||||
return false;
|
||||
}
|
||||
signal_list_[index]->done_ = true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool CpuWaitForSignal(ProfilingSignal* signal);
|
||||
|
||||
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
|
||||
size_t current_id_ = 0; //!< Last submitted signal
|
||||
hsa_agent_t agent_; //!< HSA device agent
|
||||
bool sdma_profiling_ = false; //!< Don't enable SDMA profiling by default
|
||||
HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
|
||||
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
|
||||
ProfilingSignal* external_signal_ = nullptr; //!< Dependency on external signal
|
||||
size_t current_id_ = 0; //!< Last submitted signal
|
||||
bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled
|
||||
const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker
|
||||
};
|
||||
|
||||
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
|
||||
@@ -358,7 +270,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
*/
|
||||
bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false);
|
||||
|
||||
hsa_agent_t gpu_device() { return gpu_device_; }
|
||||
hsa_agent_t gpu_device() const { return gpu_device_; }
|
||||
hsa_queue_t* gpu_queue() { return gpu_queue_; }
|
||||
|
||||
// Return pointer to PrintfDbg
|
||||
|
||||
@@ -244,6 +244,12 @@ release(bool, ROC_ACTIVE_WAIT, false, \
|
||||
"Forces unconditional active wait for GPU") \
|
||||
release(bool, ROC_ENABLE_LARGE_BAR, true, \
|
||||
"Enable Large Bar if supported by the device") \
|
||||
release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \
|
||||
"Enable CPU wait for dependent HSA signals.") \
|
||||
release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \
|
||||
"Enable system scope for signals (uses interrupts).") \
|
||||
release(bool, ROC_SKIP_COPY_SYNC, false, \
|
||||
"Skips copy syncs if runtime can predict the same engine.") \
|
||||
release(bool, HIP_FORCE_QUEUE_PROFILING, false, \
|
||||
"Force command queue profiling by default") \
|
||||
release(uint, PAL_FORCE_ASIC_REVISION, 0, \
|
||||
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador