Add HSA signal global tracking logic.

Implement the global class for signals tracking per device queue.
Switch to the new tracking mechanism.

Change-Id: I3c4dda04b34e6d18d6a95510d84102909633b415


[ROCm/clr commit: 8698aeef0d]
This commit is contained in:
German Andryeyev
2021-01-07 16:41:30 -05:00
parent a962e2d0b3
commit 30cf81fc93
6 changed files with 284 additions and 289 deletions
+78 -94
View File
@@ -62,12 +62,14 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
gpu().releaseGpuMemoryFence();
// HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
// Stall GPU before CPU access
gpu().Barriers().WaitCurrent();
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
} else {
size_t srcSize = size[0];
@@ -149,12 +151,14 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
const amd::Coord3D& size, bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
// HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableReadBufferRect_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
// Stall GPU before CPU access
gpu().Barriers().WaitCurrent();
return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
} else {
Memory& xferBuf = dev().xferRead().acquire();
@@ -187,7 +191,7 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
// HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence();
if (setup_.disableReadImage_) {
@@ -219,14 +223,16 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
} else {
// HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
size_t dstSize = size[0];
size_t tmpSize = 0;
size_t offset = 0;
@@ -309,7 +315,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
const amd::BufferRect& hostRect,
const amd::BufferRect& bufRect, const amd::Coord3D& size,
bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
// HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
@@ -347,7 +353,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
size_t rowPitch, size_t slicePitch, bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
// HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence();
if (setup_.disableWriteImage_) {
@@ -365,12 +371,11 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
gpu().releaseGpuMemoryFence();
if (setup_.disableCopyBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
(dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size);
} else {
return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), srcOrigin, dstOrigin, size);
@@ -383,14 +388,14 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
const amd::Coord3D& size, bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
gpu().releaseGpuMemoryFence();
if (setup_.disableCopyBufferRect_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
dstMemory.isHostMemDirectAccess())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire);
} else {
gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
void* src = gpuMem(srcMemory).getDeviceMemory();
void* dst = gpuMem(dstMemory).getDeviceMemory();
@@ -436,25 +441,21 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
}
if (isSubwindowRectCopy ) {
hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
hsa_signal_t wait = gpu().Barriers().WaitSignal();
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Copy memory line by line
hsa_status_t status =
hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent,
direction, 0, nullptr, completion_signal_);
hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
&srcMem, &offset, &dim, agent, direction, 1, &wait, active);
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("DMA buffer failed with code %d", status);
return false;
}
if (!WaitForSignal(completion_signal_)) {
LogError("Async copy failed");
return false;
}
} else {
// Fall to line by line copies
const hsa_signal_value_t kInitVal = size[2] * size[1];
hsa_signal_store_relaxed(completion_signal_, kInitVal);
hsa_signal_t wait = gpu().Barriers().WaitSignal();
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
for (size_t z = 0; z < size[2]; ++z) {
for (size_t y = 0; y < size[1]; ++y) {
@@ -462,10 +463,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
size_t dstOffset = dstRect.offset(0, y, z);
// Copy memory line by line
hsa_status_t status =
hsa_amd_memory_async_copy((reinterpret_cast<address>(dst) + dstOffset), dstAgent,
(reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
size[0], 0, nullptr, completion_signal_);
hsa_status_t status = hsa_amd_memory_async_copy(
(reinterpret_cast<address>(dst) + dstOffset), dstAgent,
(reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
size[0], 1, &wait, active);
gpu().setLastCommandSDMA(true) ;
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("DMA buffer failed with code %d", status);
@@ -473,14 +474,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
}
}
}
if (!WaitForSignal(completion_signal_)) {
LogError("Async copy failed");
return false;
}
}
}
// Explicit wait for now, until runtime could distinguish compute and sdma operations
gpu().Barriers().WaitCurrent();
return true;
}
@@ -489,12 +486,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire, size_t rowPitch,
size_t slicePitch) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
gpu().releaseGpuMemoryFence(true);
} else {
gpu().releaseGpuMemoryFence();
}
// HSA copy functionality with a possible async operation, hence make sure GPU is done
bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
gpu().releaseGpuMemoryFence(force_barrier);
bool result = false;
@@ -504,9 +498,6 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
} else {
Image& srcImage = static_cast<roc::Image&>(srcMemory);
Buffer& dstBuffer = static_cast<roc::Buffer&>(dstMemory);
// Use ROC path for a transfer
// Note: it doesn't support SDMA
address dstHost = reinterpret_cast<address>(dstBuffer.getDeviceMemory()) + dstOrigin[0];
// Use ROCm path for a transfer.
@@ -540,12 +531,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire, size_t rowPitch,
size_t slicePitch) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
gpu().releaseGpuMemoryFence(true);
} else {
gpu().releaseGpuMemoryFence();
}
// HSA copy functionality with a possible async operation, hence make sure GPU is done
bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
gpu().releaseGpuMemoryFence(force_barrier);
bool result = false;
@@ -588,7 +576,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire) const {
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
// HSA copy functionality with a possible async operation, hence make sure GPU is done
gpu().releaseGpuMemoryFence();
bool result = false;
@@ -610,9 +598,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
gpu().releaseGpuMemoryFence(true);
}
bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
gpu().releaseGpuMemoryFence(force_barrier, kSkipCpuWait);
src += srcOrigin[0];
dst += dstOrigin[0];
@@ -620,6 +607,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
// Just call copy function for full profile
hsa_status_t status;
if (dev().agent_profile() == HSA_PROFILE_FULL) {
// Stall GPU, since CPU copy is possible
gpu().Barriers().WaitCurrent();
status = hsa_memory_copy(dst, src, size[0]);
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("Hsa copy of data failed with code %d", status);
@@ -649,21 +638,15 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
srcAgent = dstAgent = dev().getBackendDevice();
}
hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
hsa_signal_t wait = gpu().Barriers().WaitSignal();
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Use SDMA to transfer the data
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 0, nullptr,
completion_signal_);
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 1, &wait, active);
gpu().setLastCommandSDMA(true);
// Explicit wait for now, until runtime could distinguish compute and sdma operations
gpu().Barriers().WaitCurrent();
if (status == HSA_STATUS_SUCCESS) {
hsa_signal_value_t val;
if (!WaitForSignal(completion_signal_)) {
LogError("Async copy failed");
status = HSA_STATUS_ERROR;
} else {
gpu().addSystemScope();
}
gpu().addSystemScope();
} else {
LogPrintfError("Hsa copy from host to device failed with code %d", status);
}
@@ -674,6 +657,10 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
// ================================================================================================
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
address staging, bool hostToDev) const {
// Stall GPU, since CPU copy is possible
bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
gpu().releaseGpuMemoryFence(force_barrier);
// No allocation is necessary for Full Profile
hsa_status_t status;
if (dev().agent_profile() == HSA_PROFILE_FULL) {
@@ -688,14 +675,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
size_t offset = 0;
address hsaBuffer = staging;
if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
gpu().releaseGpuMemoryFence(true);
}
// Allocate requested size of memory
while (totalSize > 0) {
size = std::min(totalSize, dev().settings().stagedXferSize_);
hsa_signal_silent_store_relaxed(completion_signal_, kInitSignalValueOne);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Copy data from Host to Device
if (hostToDev) {
@@ -707,17 +691,13 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
memcpy(hsaBuffer, hostSrc + offset, size);
status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
srcAgent, size, 0, nullptr, completion_signal_);
srcAgent, size, 0, nullptr, active);
gpu().setLastCommandSDMA(true);
if (status == HSA_STATUS_SUCCESS) {
if (!WaitForSignal(completion_signal_)) {
LogError("Async copy failed");
return false;
}
} else {
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("Hsa copy from host to device failed with code %d", status);
return false;
}
gpu().Barriers().WaitCurrent();
totalSize -= size;
offset += size;
continue;
@@ -730,15 +710,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
// Copy data from Device to Host
status =
hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
dev().getBackendDevice(), size, 0, nullptr, completion_signal_);
status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
dev().getBackendDevice(), size, 0, nullptr, active);
gpu().setLastCommandSDMA(true);
if (status == HSA_STATUS_SUCCESS) {
if (!WaitForSignal(completion_signal_)) {
LogError("Async copy failed");
return false;
}
gpu().Barriers().WaitCurrent();
memcpy(hostDst + offset, hsaBuffer, size);
} else {
LogPrintfError("Hsa copy from device to host failed with code %d", status);
@@ -1083,11 +1059,7 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
releaseArguments(parameters);
if (releaseView) {
// todo SRD programming could be changed to avoid a stall
if(!dev().settings().barrier_sync_) {
gpu().releaseGpuMemoryFence(true);
} else {
gpu().releaseGpuMemoryFence();
}
gpu().releaseGpuMemoryFence();
dstView->owner()->release();
}
@@ -1285,11 +1257,7 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
releaseArguments(parameters);
if (releaseView) {
// todo SRD programming could be changed to avoid a stall
if(!dev().settings().barrier_sync_) {
gpu().releaseGpuMemoryFence(true);
} else {
gpu().releaseGpuMemoryFence();
}
gpu().releaseGpuMemoryFence();
srcView->owner()->release();
}
@@ -1465,6 +1433,8 @@ bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
// Use host copy if memory has direct access
if (setup_.disableReadImage_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire);
synchronize();
return result;
@@ -1510,6 +1480,8 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
// Use host copy if memory has direct access
if (setup_.disableWriteImage_ || dstMemory.isHostMemDirectAccess()) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
synchronize();
return result;
@@ -1704,6 +1676,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
synchronize();
return result;
@@ -1753,6 +1727,8 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
// Use host copy if memory has direct access
if (setup_.disableReadBufferRect_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
synchronize();
return result;
@@ -1814,6 +1790,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
synchronize();
return result;
@@ -1864,6 +1842,8 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
// Use host copy if memory has direct access
if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
synchronize();
return result;
@@ -1913,6 +1893,8 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
// Use host fill if memory has direct access
if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire);
synchronize();
return result;
@@ -2074,6 +2056,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
// Use host fill if memory has direct access
if (setup_.disableFillImage_ || memory.isHostMemDirectAccess()) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
synchronize();
return result;
+5 -14
View File
@@ -40,6 +40,9 @@ class Kernel;
class Memory;
class VirtualGPU;
constexpr bool kSkipCpuWait = true;
constexpr bool kIgnoreBarrier = false;
//! DMA Blit Manager
class DmaBlitManager : public device::HostBlitManager {
public:
@@ -49,19 +52,10 @@ class DmaBlitManager : public device::HostBlitManager {
);
//! Destructor
virtual ~DmaBlitManager() {
if (completion_signal_.handle != 0) {
hsa_signal_destroy(completion_signal_);
}
}
virtual ~DmaBlitManager() {}
//! Creates DmaBlitManager object
virtual bool create(amd::Device& device) {
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &completion_signal_)) {
return false;
}
return true;
}
virtual bool create(amd::Device& device) { return true; }
//! Copies a buffer object to system memory
virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object
@@ -225,9 +219,6 @@ class DmaBlitManager : public device::HostBlitManager {
size_t xferSize //!< Transfer size
) const;
//! Handle of ROC Device object
hsa_signal_t completion_signal_;
//! Assists in transferring data from Host to Local or vice versa
//! taking into account the Hsail profile supported by Hsa Agent
bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied
@@ -260,7 +260,12 @@ Device::~Device() {
context().svmFree(coopHostcallBuffer_);
coopHostcallBuffer_ = nullptr;
}
if (0 != prefetch_signal_.handle) {
hsa_signal_destroy(prefetch_signal_);
}
}
bool NullDevice::initCompiler(bool isOffline) {
#if defined(WITH_COMPILER_LIB)
// Initialize the compiler handle if has already not been initialized
@@ -199,8 +199,9 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) {
amd::Coord3D(size()), true)) {
LogError("[OCL] Fail sync the device memory on cpuUnmap");
}
// Wait on CPU for the transfer
static_cast<roc::VirtualGPU&>(vDev).releaseGpuMemoryFence();
}
decIndMapCount();
}
+60 -167
View File
@@ -456,34 +456,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// Check for queue full and wait if needed.
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
hsa_signal_t signal;
// TODO: placeholder to setup the kernel to populate start and end timestamp.
if (timestamp_ != nullptr) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
if (pool_size >= signal_pool_.size()) {
ProfilingSignal profiling_signal = {};
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
LogPrintfError("Failed signal allocation id = %d", pool_size);
return false;
}
signal_pool_.push_back(profiling_signal);
assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
}
// Move index inside the valid pool
++current_signal_ %= signal_pool_.size();
// Find signal slot
ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
// Make sure we save the old results in the TS structure
if (profilingSignal->ts_ != nullptr) {
profilingSignal->ts_->checkGpuTime();
}
// Update the new TS with the signal info
timestamp_->setProfilingSignal(profilingSignal);
packet->completion_signal = profilingSignal->signal_;
profilingSignal->ts_ = timestamp_;
timestamp_->setAgent(gpu_device_);
// Get active signal for current dispatch if profiling is necessary
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
}
// Make sure the slot is free for usage
@@ -494,23 +472,11 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// Add blocking command if the original value of read index was behind of the queue size
if (blocking || (index - read) >= queueMask) {
if (packet->completion_signal.handle == 0) {
packet->completion_signal = barrier_signal_;
packet->completion_signal = Barriers().ActiveSignal();
}
signal = packet->completion_signal;
// Initialize signal for a wait
hsa_signal_store_relaxed(signal, kInitSignalValueOne);
blocking = true;
}
// If runtime doesn't use the barrier, then make sure it tracks the last submitted command
if (!dev().settings().barrier_sync_) {
// Initialize signal for a wait
assert(packet->completion_signal.handle != 0 &&
"There is no HSA signal associated with the last command!");
hsa_signal_store_relaxed(packet->completion_signal, kInitSignalValueOne);
last_signal_ = packet->completion_signal;
}
// Insert packet(s)
// NOTE: need multiple packets to dispatch the performance counter
// packet blob of the legacy devices (gfx8)
@@ -550,12 +516,10 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// Wait on signal ?
if (blocking) {
if (!WaitForSignal(signal)) {
LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
if (!Barriers().WaitCurrent()) {
LogPrintfError("Failed blocking queue wait with signal [0x%lx]", packet->completion_signal.handle);
return false;
}
// Reset the pool of signals
current_signal_ = 0;
}
return true;
@@ -601,6 +565,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
return false;
}
// ================================================================================================
void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
assert(packet->completion_signal.handle != 0);
const uint32_t queueSize = gpu_queue_->size;
@@ -631,6 +597,7 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal);
}
// ================================================================================================
void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
uint16_t packetHeader, hsa_signal_t signal) {
const uint32_t queueSize = gpu_queue_->size;
@@ -641,30 +608,8 @@ void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
if (signal.handle == 0) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
if (pool_size >= signal_pool_.size()) {
ProfilingSignal profiling_signal = {};
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
LogPrintfError("Failed signal allocation id = %d", pool_size);
}
signal_pool_.push_back(profiling_signal);
assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
}
// Move index inside the valid pool
++current_signal_ %= signal_pool_.size();
// Find signal slot
ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
// Make sure we save the old results in the TS structure
if (profilingSignal->ts_ != nullptr) {
profilingSignal->ts_->checkGpuTime();
}
if (timestamp_ != nullptr) {
// Update the new TS with the signal info
timestamp_->setProfilingSignal(profilingSignal);
profilingSignal->ts_ = timestamp_;
timestamp_->setAgent(gpu_device_);
}
packet->completion_signal = profilingSignal->signal_;
hsa_signal_store_relaxed(profilingSignal->signal_, kInitSignalValueOne);
// Get active signal for current dispatch if profiling is necessary
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
} else {
assert(signal.handle != 0);
packet->completion_signal = signal;
@@ -705,44 +650,25 @@ void VirtualGPU::ResetQueueStates() {
// Release the pool, since runtime just completed a barrier
// @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued
resetKernArgPool();
} else {
// Reset the pool of signals
current_signal_ = 0;
}
}
// ================================================================================================
bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier) {
// Return if there is no pending dispatch
if (!hasPendingDispatch_) {
if (dev().settings().barrier_sync_ || !force_barrier) {
return false;
}
}
hsa_signal_t wait_signal = barrier_signal_;
bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier, bool skip_cpu_wait) {
if (hasPendingDispatch_ && (dev().settings().barrier_sync_ || force_barrier)) {
barrier_packet_.completion_signal = Barriers().ActiveSignal();
// If barrier sync was requested or runtime didn't provide the last signal
if (dev().settings().barrier_sync_ || force_barrier) {
// Initialize signal for the barrier packet.
hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
// Dispatch barrier packet into the queue and wait till it finishes.
// Dispatch barrier packet into the queue
dispatchBarrierPacket(&barrier_packet_);
}
else {
// Take the signal of the last submitted dispatch
wait_signal = last_signal_;
hasPendingDispatch_ = false;
}
// Wait for compute work previously submitted
if (!WaitForSignal(wait_signal)) {
LogError("Waiting for compute work failed!");
return false;
// Check if runtime could skip CPU wait
if (!skip_cpu_wait) {
Barriers().WaitCurrent();
ResetQueueStates();
}
hasPendingDispatch_ = false;
ResetQueueStates();
return true;
}
@@ -800,7 +726,6 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
}
aqlHeader_ = dispatchPacketHeader_;
barrier_signal_.handle = 0;
// Note: Virtual GPU device creation must be a thread safe operation
roc_device_.vgpus_.resize(roc_device_.numOfVgpus_);
@@ -808,16 +733,13 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
}
// ================================================================================================
VirtualGPU::~VirtualGPU() {
delete blitMgr_;
// Release the resources of signal
releaseGpuMemoryFence();
if (barrier_signal_.handle != 0) {
hsa_signal_destroy(barrier_signal_);
}
destroyPool();
releasePinnedMem();
@@ -868,8 +790,7 @@ bool VirtualGPU::create() {
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
if (!gpu_queue_) return false;
if (!initPool(dev().settings().kernargPoolSize_,
(profiling_ || (amd::IS_HIP)) ? queue_size : 0)) {
if (!initPool(dev().settings().kernargPoolSize_)) {
LogError("Couldn't allocate arguments/signals for the queue");
return false;
}
@@ -881,17 +802,9 @@ bool VirtualGPU::create() {
return false;
}
// Create signal for the barrier packet.
hsa_signal_t signal = {0};
if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne, 0, nullptr, &signal)) {
return false;
}
barrier_signal_ = signal;
// Initialize barrier packet.
memset(&barrier_packet_, 0, sizeof(barrier_packet_));
barrier_packet_.header = kInvalidAql;
barrier_packet_.completion_signal = barrier_signal_;
// Create a object of PrintfDbg
printfdbg_ = new PrintfDbg(roc_device_);
@@ -912,62 +825,32 @@ bool VirtualGPU::create() {
return false;
}
// Allocate signal tracker for ROCr copy queue
if (!Barriers().Create(gpu_device())) {
LogError("Could not create signal for copy queue!");
return false;
}
return true;
}
bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) {
// ================================================================================================
bool VirtualGPU::initPool(size_t kernarg_pool_size) {
kernarg_pool_size_ = kernarg_pool_size;
kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, false));
if (kernarg_pool_base_ == nullptr) {
return false;
}
// Optimization :
// For better resource utilization runtime should create them only when required
// In case of HIP, Apps create short live streams which do not need more signals
// hence starting with smaller number 32. There is code inplace to grow the pool
// later when it is needed.
bool forced_default_pool_sz = false;
if (!profiling_ && (amd::IS_HIP)) {
forced_default_pool_sz = true;
}
if (signal_pool_count != 0) {
// Reserve signal pool for all entries in the queue, since profiling logic will save the
// pointer in timestamp info for the future references
signal_pool_.reserve(signal_pool_count);
// If barrier is disable, then allocate a small portion of all signals and grow the array later.
// @note: the optimization requires a wait for signal on reuse, which is only available when
// the barrier is disabled
constexpr uint32_t kDefaultSignalPoolSize = 32;
const uint32_t default_signal_pool_size =
(dev().settings().barrier_sync_ && !forced_default_pool_sz) ?
signal_pool_count : kDefaultSignalPoolSize;
signal_pool_.resize(default_signal_pool_size);
for (uint i = 0; i < default_signal_pool_size; ++i) {
ProfilingSignal profilingSignal;
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) {
return false;
}
signal_pool_[i] = profilingSignal;
}
}
return true;
}
// ================================================================================================
void VirtualGPU::destroyPool() {
if (kernarg_pool_base_ != nullptr) {
roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
}
if (signal_pool_.size() > 0) {
for (uint i = 0; i < signal_pool_.size(); ++i) {
hsa_signal_destroy(signal_pool_[i].signal_);
}
}
}
// ================================================================================================
void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
char* result = nullptr;
do {
@@ -982,24 +865,21 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
//! We can issue a barrier to avoid expensive extra memory allocations.
// Initialize signal for the barrier packet.
hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
barrier_packet_.completion_signal = Barriers().ActiveSignal();
// Dispatch barrier packet into the queue and wait till it finishes.
dispatchBarrierPacket(&barrier_packet_);
if (!WaitForSignal(barrier_signal_)) {
if (!Barriers().WaitCurrent()) {
LogError("Kernel arguments reset failed");
}
resetKernArgPool();
// Reset the pool of signals
current_signal_ = 0;
}
} while (true);
return result;
}
// ================================================================================================
/* profilingBegin, when profiling is enabled, creates a timestamp to save in
* virtualgpu's timestamp_, and calls start() to get the current host
* timestamp.
@@ -1007,9 +887,8 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
if (command.profilingInfo().enabled_) {
if (timestamp_ != nullptr) {
LogWarning(
"Trying to create a second timestamp in VirtualGPU. \
This could have unintended consequences.");
LogWarning("Trying to create a second timestamp in VirtualGPU. \
This could have unintended consequences.");
return;
}
// Without barrier profiling will wait for each individual signal
@@ -1018,6 +897,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
}
}
// ================================================================================================
/* profilingEnd, when profiling is enabled, checks to see if a signal was
* created for whatever command we are running and calls end() to get the
* current host timestamp if no signal is available. It then saves the pointer
@@ -1033,6 +913,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
}
}
// ================================================================================================
void VirtualGPU::updateCommandsState(amd::Command* list) {
Timestamp* ts = nullptr;
@@ -1335,8 +1216,10 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
// ================================================================================================
void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
#if AMD_HMM_SUPPORT
profilingBegin(cmd);
// Initialize signal for the barrier
hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
hsa_signal_t wait = Barriers().WaitSignal();
hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
// Find the requested agent for the transfer
hsa_agent_t agent = (cmd.cpu_access() ||
@@ -1345,16 +1228,18 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
// Initiate a prefetch command
hsa_status_t status = hsa_amd_svm_prefetch_async(
const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 0, nullptr, barrier_signal_);
const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 1, &wait, active);
// Wait for the prefetch
if ((status != HSA_STATUS_SUCCESS) || !WaitForSignal(barrier_signal_)) {
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
LogError("hsa_amd_svm_prefetch_async failed");
cmd.setStatus(CL_INVALID_OPERATION);
}
// Add system scope, since the prefetch scope is unclear
addSystemScope();
profilingEnd(cmd);
#endif // AMD_HMM_SUPPORT
}
@@ -2490,9 +2375,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
if (gpuKernel.dynamicParallelism()) {
barrier_packet_.completion_signal.handle = 0;
dispatchBarrierPacket(&barrier_packet_);
static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
}
// Check if image buffer write back is required
@@ -2594,9 +2480,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
uint16_t header = kNopPacketHeader;
hsa_signal_t sig { 0 };
dispatchGenericBarrierPacket(&barrier_packet_, header, sig);
last_signal_ = barrier_packet_.completion_signal;
// Restore barrier signal
barrier_packet_.completion_signal = barrier_signal_;
}
profilingEnd(vcmd);
}
@@ -2618,8 +2501,8 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
profilingBegin(vcmd);
if (!dev().settings().barrier_sync_) {
// Force barrier to make sure L2 flush, since interop can be in sysmem
constexpr bool ForceBarrier = true;
releaseGpuMemoryFence(ForceBarrier);
constexpr bool kForceBarrier = true;
releaseGpuMemoryFence(kForceBarrier);
}
profilingEnd(vcmd);
}
@@ -2644,6 +2527,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
// ================================================================================================
void VirtualGPU::addXferWrite(Memory& memory) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
if (xferWriteBuffers_.size() > 7) {
dev().xferWrite().release(*this, *xferWriteBuffers_.front());
xferWriteBuffers_.erase(xferWriteBuffers_.begin());
@@ -2653,6 +2539,7 @@ void VirtualGPU::addXferWrite(Memory& memory) {
xferWriteBuffers_.push_back(&memory);
}
// ================================================================================================
void VirtualGPU::releaseXferWrite() {
for (auto& memory : xferWriteBuffers_) {
dev().xferWrite().release(*this, *memory);
@@ -2660,7 +2547,11 @@ void VirtualGPU::releaseXferWrite() {
xferWriteBuffers_.resize(0);
}
// ================================================================================================
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
if (pinnedMems_.size() > 7) {
pinnedMems_.front()->release();
@@ -2672,6 +2563,7 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
}
}
// ================================================================================================
void VirtualGPU::releasePinnedMem() {
for (auto& amdMemory : pinnedMems_) {
amdMemory->release();
@@ -2679,6 +2571,7 @@ void VirtualGPU::releasePinnedMem() {
pinnedMems_.resize(0);
}
// ================================================================================================
amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
for (auto& amdMemory : pinnedMems_) {
if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
+134 -13
View File
@@ -36,10 +36,11 @@ class Memory;
class Timestamp;
struct ProfilingSignal : public amd::HeapObject {
hsa_signal_t signal_; //!< HSA signal to track profiling information
Timestamp* ts_; //!< Timestamp object associated with the signal
hsa_signal_t signal_; //!< HSA signal to track profiling information
Timestamp* ts_; //!< Timestamp object associated with the signal
bool done_; //!< True if signal is done
ProfilingSignal() : ts_(nullptr) { signal_.handle = 0; }
ProfilingSignal() : ts_(nullptr), done_(true) { signal_.handle = 0; }
};
// Initial HSA signal value
@@ -111,13 +112,19 @@ class Timestamp {
hsa_amd_profiling_dispatch_time_t time;
if (splittedDispatch_) {
uint64_t start = UINT64_MAX;
uint64_t start = std::numeric_limits<uint64_t>::max();
uint64_t end = 0;
for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) {
if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) {
WaitForSignal(*it);
}
hsa_amd_profiling_get_dispatch_time(agent_, *it, &time);
if ((time.end - time.start) == 0) {
hsa_amd_profiling_async_copy_time_t time_sdma = {};
hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
time.start = time_sdma.start;
time.end = time_sdma.end;
}
if (time.start < start) {
start = time.start;
}
@@ -133,10 +140,18 @@ class Timestamp {
WaitForSignal(profilingSignal_->signal_);
}
hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
start_ = time.start * ticksToTime_;
end_ = time.end * ticksToTime_;
if ((time.end - time.start) == 0) {
hsa_amd_profiling_async_copy_time_t time_sdma = {};
hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
start_ = time_sdma.start * ticksToTime_;
end_ = time_sdma.end * ticksToTime_;
} else {
start_ = time.start * ticksToTime_;
end_ = time.end * ticksToTime_;
}
}
profilingSignal_->ts_ = nullptr;
profilingSignal_->done_ = true;
profilingSignal_ = nullptr;
}
}
@@ -192,6 +207,109 @@ class VirtualGPU : public device::VirtualDevice {
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
};
class HwQueueTracker : public amd::EmbeddedObject {
public:
HwQueueTracker() {}
~HwQueueTracker() {
for (auto& signal: signal_list_) {
if (signal->signal_.handle != 0) {
hsa_signal_destroy(signal->signal_);
}
delete signal;
}
}
//! Creates a pool of signals for tracking of HW operations on the queue
bool Create(hsa_agent_t agent) {
constexpr size_t kSignalListSize = 16;
signal_list_.resize(kSignalListSize);
for (uint i = 0; i < kSignalListSize; ++i) {
ProfilingSignal* signal = new ProfilingSignal();
if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create(
0, 1, &agent, &signal->signal_))) {
return false;
}
signal_list_[i] = signal;
}
agent_ = agent;
return true;
}
//! Finds a free signal for the upcomming operation
hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
Timestamp* ts = nullptr, uint32_t queue_size = 0) {
// If queue size grows, then add more signals to avoid more frequent stalls
if (queue_size > signal_list_.size()) {
ProfilingSignal* signal = new ProfilingSignal();
if (signal != nullptr) {
if (HSA_STATUS_SUCCESS == hsa_signal_create(
0, 1, &agent_, &signal->signal_)) {
signal_list_.push_back(signal);
}
}
}
// Find valid index
++current_id_ %= signal_list_.size();
// Make sure the previous operation on the current signal is done
WaitCurrent();
// Have to wait the next signal in the queue to avoid a race condition between
// a GPU waiter(which may be not triggered yet) and CPU signal reset below
WaitNext();
// Reset the signal and return
hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
signal_list_[current_id_]->done_ = false;
if (ts != 0) {
if (!sdma_profiling_) {
hsa_amd_profiling_async_copy_enable(true);
sdma_profiling_ = true;
}
signal_list_[current_id_]->ts_ = ts;
ts->setProfilingSignal(signal_list_[current_id_]);
ts->setAgent(agent_);
}
return signal_list_[current_id_]->signal_;
}
//! Wait for the curent active signal. Can idle the queue
bool WaitCurrent() { return WaitIndex(current_id_); }
//! Returns the last submitted signal for a wait
hsa_signal_t WaitSignal() const { return signal_list_[current_id_]->signal_; }
private:
//! Wait for the next active signal
void WaitNext() {
size_t next = (current_id_ + 1) % signal_list_.size();
WaitIndex(next);
}
//! Wait for the provided signal
bool WaitIndex(size_t index) {
// Wait for the current signal
if (!signal_list_[index]->done_) {
// Update timestamp values if requested
if (signal_list_[index]->ts_ != nullptr) {
signal_list_[index]->ts_->checkGpuTime();
} else {
if (!WaitForSignal(signal_list_[index]->signal_)) {
LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_);
return false;
}
signal_list_[index]->done_ = true;
}
}
return true;
}
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
size_t current_id_ = 0; //!< Last submitted signal
hsa_agent_t agent_; //!< HSA device agent
bool sdma_profiling_ = false; //!< Don't enable SDMA profiling by default
};
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
const std::vector<uint32_t>& cuMask = {},
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
@@ -256,7 +374,7 @@ class VirtualGPU : public device::VirtualDevice {
*
* @return bool true if Wait returned successfully, false otherwise
*/
bool releaseGpuMemoryFence(bool force_barrier = false);
bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false);
hsa_agent_t gpu_device() { return gpu_device_; }
hsa_queue_t* gpu_queue() { return gpu_queue_; }
@@ -297,6 +415,10 @@ class VirtualGPU : public device::VirtualDevice {
void addSystemScope() { addSystemScope_ = true; }
void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }
HwQueueTracker& Barriers() { return barriers_; }
Timestamp* timestamp() const { return timestamp_; }
// } roc OpenCL integration
private:
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
@@ -316,7 +438,7 @@ class VirtualGPU : public device::VirtualDevice {
void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet,
amd::NDRangeContainer& sizes);
bool initPool(size_t kernarg_pool_size, uint signal_pool_count);
bool initPool(size_t kernarg_pool_size);
void destroyPool();
void* allocKernArg(size_t size, size_t alignment);
@@ -368,7 +490,7 @@ class VirtualGPU : public device::VirtualDevice {
uint32_t cooperative_ : 1; //!< Cooperative launch is enabled
uint32_t addSystemScope_ : 1; //!< Insert a system scope to the next aql
uint32_t isLastCommandSDMA_ : 1; //!< Keep track if the last command was SDMA and
//!< not send Barrier packets if barrier_sync is 0
//!< not send Barrier packets if barrier_sync is 0
};
uint32_t state_;
};
@@ -379,8 +501,7 @@ class VirtualGPU : public device::VirtualDevice {
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
hsa_barrier_and_packet_t barrier_packet_;
hsa_signal_t barrier_signal_;
hsa_signal_t last_signal_ = {}; //!< Last submitted signal
uint32_t dispatch_id_; //!< This variable must be updated atomically.
Device& roc_device_; //!< roc device object
PrintfDbg* printfdbg_;
@@ -396,12 +517,12 @@ class VirtualGPU : public device::VirtualDevice {
hsa_queue_t* schedulerQueue_;
hsa_signal_t schedulerSignal_;
HwQueueTracker barriers_; //!< Tracks active barriers in ROCr
char* kernarg_pool_base_;
size_t kernarg_pool_size_;
uint kernarg_pool_cur_offset_;
std::vector<ProfilingSignal> signal_pool_; //!< Pool of signals for profiling
uint32_t current_signal_ = 0; //!< Current avaialble signal in the pool
friend class Timestamp;
// PM4 packet for gfx8 performance counter