Add global tracking logic for HSA signals.
Implement a global class that tracks signals per device queue.
Switch to the new tracking mechanism.
Change-Id: I3c4dda04b34e6d18d6a95510d84102909633b415
[ROCm/clr commit: 8698aeef0d]
This commit is contained in:
@@ -62,12 +62,14 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
|
||||
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
// HSA copy functionality with a possible async operation
|
||||
gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBuffer_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().Barriers().WaitCurrent();
|
||||
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
|
||||
} else {
|
||||
size_t srcSize = size[0];
|
||||
@@ -149,12 +151,14 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
// HSA copy functionality with a possible async operation
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBufferRect_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().Barriers().WaitCurrent();
|
||||
return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
|
||||
} else {
|
||||
Memory& xferBuf = dev().xferRead().acquire();
|
||||
@@ -187,7 +191,7 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
|
||||
const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
|
||||
bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
// HSA copy functionality with a possible async operation
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableReadImage_) {
|
||||
@@ -219,14 +223,16 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
|
||||
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap()) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
|
||||
} else {
|
||||
// HSA copy functionality with a possible async operation
|
||||
gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
|
||||
|
||||
size_t dstSize = size[0];
|
||||
size_t tmpSize = 0;
|
||||
size_t offset = 0;
|
||||
@@ -309,7 +315,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
const amd::BufferRect& hostRect,
|
||||
const amd::BufferRect& bufRect, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
// HSA copy functionality with a possible async operation
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
@@ -347,7 +353,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
size_t rowPitch, size_t slicePitch, bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
// HSA copy functionality with a possible async operation
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableWriteImage_) {
|
||||
@@ -365,12 +371,11 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
|
||||
bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableCopyBuffer_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
|
||||
(dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size);
|
||||
} else {
|
||||
return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), srcOrigin, dstOrigin, size);
|
||||
@@ -383,14 +388,14 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
|
||||
bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableCopyBufferRect_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
|
||||
dstMemory.isHostMemDirectAccess())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire);
|
||||
} else {
|
||||
gpu().releaseGpuMemoryFence(kIgnoreBarrier, kSkipCpuWait);
|
||||
|
||||
void* src = gpuMem(srcMemory).getDeviceMemory();
|
||||
void* dst = gpuMem(dstMemory).getDeviceMemory();
|
||||
@@ -436,25 +441,21 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
}
|
||||
|
||||
if (isSubwindowRectCopy ) {
|
||||
hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
|
||||
hsa_signal_t wait = gpu().Barriers().WaitSignal();
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
// Copy memory line by line
|
||||
hsa_status_t status =
|
||||
hsa_amd_memory_async_copy_rect(&dstMem, &offset, &srcMem, &offset, &dim, agent,
|
||||
direction, 0, nullptr, completion_signal_);
|
||||
hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
|
||||
&srcMem, &offset, &dim, agent, direction, 1, &wait, active);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("DMA buffer failed with code %d", status);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!WaitForSignal(completion_signal_)) {
|
||||
LogError("Async copy failed");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// Fall to line by line copies
|
||||
const hsa_signal_value_t kInitVal = size[2] * size[1];
|
||||
hsa_signal_store_relaxed(completion_signal_, kInitVal);
|
||||
hsa_signal_t wait = gpu().Barriers().WaitSignal();
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitVal, gpu().timestamp());
|
||||
|
||||
for (size_t z = 0; z < size[2]; ++z) {
|
||||
for (size_t y = 0; y < size[1]; ++y) {
|
||||
@@ -462,10 +463,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
size_t dstOffset = dstRect.offset(0, y, z);
|
||||
|
||||
// Copy memory line by line
|
||||
hsa_status_t status =
|
||||
hsa_amd_memory_async_copy((reinterpret_cast<address>(dst) + dstOffset), dstAgent,
|
||||
(reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
|
||||
size[0], 0, nullptr, completion_signal_);
|
||||
hsa_status_t status = hsa_amd_memory_async_copy(
|
||||
(reinterpret_cast<address>(dst) + dstOffset), dstAgent,
|
||||
(reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
|
||||
size[0], 1, &wait, active);
|
||||
gpu().setLastCommandSDMA(true) ;
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("DMA buffer failed with code %d", status);
|
||||
@@ -473,14 +474,10 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!WaitForSignal(completion_signal_)) {
|
||||
LogError("Async copy failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// Explicit wait for now, until the runtime can distinguish compute and SDMA operations
|
||||
gpu().Barriers().WaitCurrent();
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -489,12 +486,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire, size_t rowPitch,
|
||||
size_t slicePitch) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
|
||||
gpu().releaseGpuMemoryFence(true);
|
||||
} else {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
}
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
|
||||
gpu().releaseGpuMemoryFence(force_barrier);
|
||||
|
||||
bool result = false;
|
||||
|
||||
@@ -504,9 +498,6 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
|
||||
} else {
|
||||
Image& srcImage = static_cast<roc::Image&>(srcMemory);
|
||||
Buffer& dstBuffer = static_cast<roc::Buffer&>(dstMemory);
|
||||
|
||||
// Use ROC path for a transfer
|
||||
// Note: it doesn't support SDMA
|
||||
address dstHost = reinterpret_cast<address>(dstBuffer.getDeviceMemory()) + dstOrigin[0];
|
||||
|
||||
// Use ROCm path for a transfer.
|
||||
@@ -540,12 +531,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire, size_t rowPitch,
|
||||
size_t slicePitch) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
|
||||
gpu().releaseGpuMemoryFence(true);
|
||||
} else {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
}
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
|
||||
gpu().releaseGpuMemoryFence(force_barrier);
|
||||
|
||||
bool result = false;
|
||||
|
||||
@@ -588,7 +576,7 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
|
||||
bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
// HSA copy functionality with a possible async operaiton, hence make sure GPU is done
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
bool result = false;
|
||||
@@ -610,9 +598,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
|
||||
address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
|
||||
|
||||
if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
|
||||
gpu().releaseGpuMemoryFence(true);
|
||||
}
|
||||
bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
|
||||
gpu().releaseGpuMemoryFence(force_barrier, kSkipCpuWait);
|
||||
|
||||
src += srcOrigin[0];
|
||||
dst += dstOrigin[0];
|
||||
@@ -620,6 +607,8 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
// Just call copy function for full profile
|
||||
hsa_status_t status;
|
||||
if (dev().agent_profile() == HSA_PROFILE_FULL) {
|
||||
// Stall GPU, since CPU copy is possible
|
||||
gpu().Barriers().WaitCurrent();
|
||||
status = hsa_memory_copy(dst, src, size[0]);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("Hsa copy of data failed with code %d", status);
|
||||
@@ -649,21 +638,15 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
srcAgent = dstAgent = dev().getBackendDevice();
|
||||
}
|
||||
|
||||
hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
|
||||
|
||||
hsa_signal_t wait = gpu().Barriers().WaitSignal();
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
// Use SDMA to transfer the data
|
||||
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 0, nullptr,
|
||||
completion_signal_);
|
||||
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 1, &wait, active);
|
||||
gpu().setLastCommandSDMA(true);
|
||||
// Explicit wait for now, until the runtime can distinguish compute and SDMA operations
|
||||
gpu().Barriers().WaitCurrent();
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_signal_value_t val;
|
||||
|
||||
if (!WaitForSignal(completion_signal_)) {
|
||||
LogError("Async copy failed");
|
||||
status = HSA_STATUS_ERROR;
|
||||
} else {
|
||||
gpu().addSystemScope();
|
||||
}
|
||||
gpu().addSystemScope();
|
||||
} else {
|
||||
LogPrintfError("Hsa copy from host to device failed with code %d", status);
|
||||
}
|
||||
@@ -674,6 +657,10 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
// ================================================================================================
|
||||
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
|
||||
address staging, bool hostToDev) const {
|
||||
// Stall GPU, since CPU copy is possible
|
||||
bool force_barrier = !dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA();
|
||||
gpu().releaseGpuMemoryFence(force_barrier);
|
||||
|
||||
// No allocation is necessary for Full Profile
|
||||
hsa_status_t status;
|
||||
if (dev().agent_profile() == HSA_PROFILE_FULL) {
|
||||
@@ -688,14 +675,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
size_t offset = 0;
|
||||
|
||||
address hsaBuffer = staging;
|
||||
if (!dev().settings().barrier_sync_ && !gpu().isLastCommandSDMA()) {
|
||||
gpu().releaseGpuMemoryFence(true);
|
||||
}
|
||||
|
||||
// Allocate requested size of memory
|
||||
while (totalSize > 0) {
|
||||
size = std::min(totalSize, dev().settings().stagedXferSize_);
|
||||
hsa_signal_silent_store_relaxed(completion_signal_, kInitSignalValueOne);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
// Copy data from Host to Device
|
||||
if (hostToDev) {
|
||||
@@ -707,17 +691,13 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
|
||||
memcpy(hsaBuffer, hostSrc + offset, size);
|
||||
status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
|
||||
srcAgent, size, 0, nullptr, completion_signal_);
|
||||
srcAgent, size, 0, nullptr, active);
|
||||
gpu().setLastCommandSDMA(true);
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
if (!WaitForSignal(completion_signal_)) {
|
||||
LogError("Async copy failed");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("Hsa copy from host to device failed with code %d", status);
|
||||
return false;
|
||||
}
|
||||
gpu().Barriers().WaitCurrent();
|
||||
totalSize -= size;
|
||||
offset += size;
|
||||
continue;
|
||||
@@ -730,15 +710,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
(size <= dev().settings().sdmaCopyThreshold_) ? dev().getBackendDevice() : dev().getCpuAgent();
|
||||
|
||||
// Copy data from Device to Host
|
||||
status =
|
||||
hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
|
||||
dev().getBackendDevice(), size, 0, nullptr, completion_signal_);
|
||||
status = hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
|
||||
dev().getBackendDevice(), size, 0, nullptr, active);
|
||||
gpu().setLastCommandSDMA(true);
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
if (!WaitForSignal(completion_signal_)) {
|
||||
LogError("Async copy failed");
|
||||
return false;
|
||||
}
|
||||
gpu().Barriers().WaitCurrent();
|
||||
memcpy(hostDst + offset, hsaBuffer, size);
|
||||
} else {
|
||||
LogPrintfError("Hsa copy from device to host failed with code %d", status);
|
||||
@@ -1083,11 +1059,7 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
|
||||
releaseArguments(parameters);
|
||||
if (releaseView) {
|
||||
// todo SRD programming could be changed to avoid a stall
|
||||
if(!dev().settings().barrier_sync_) {
|
||||
gpu().releaseGpuMemoryFence(true);
|
||||
} else {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
}
|
||||
gpu().releaseGpuMemoryFence();
|
||||
dstView->owner()->release();
|
||||
}
|
||||
|
||||
@@ -1285,11 +1257,7 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
|
||||
releaseArguments(parameters);
|
||||
if (releaseView) {
|
||||
// todo SRD programming could be changed to avoid a stall
|
||||
if(!dev().settings().barrier_sync_) {
|
||||
gpu().releaseGpuMemoryFence(true);
|
||||
} else {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
}
|
||||
gpu().releaseGpuMemoryFence();
|
||||
srcView->owner()->release();
|
||||
}
|
||||
|
||||
@@ -1465,6 +1433,8 @@ bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadImage_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1510,6 +1480,8 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteImage_ || dstMemory.isHostMemDirectAccess()) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1704,6 +1676,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1753,6 +1727,8 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBufferRect_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1814,6 +1790,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap()) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1864,6 +1842,8 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap()) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1913,6 +1893,8 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
|
||||
|
||||
// Use host fill if memory has direct access
|
||||
if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -2074,6 +2056,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
|
||||
// Use host fill if memory has direct access
|
||||
if (setup_.disableFillImage_ || memory.isHostMemDirectAccess()) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
|
||||
@@ -40,6 +40,9 @@ class Kernel;
|
||||
class Memory;
|
||||
class VirtualGPU;
|
||||
|
||||
constexpr bool kSkipCpuWait = true;
|
||||
constexpr bool kIgnoreBarrier = false;
|
||||
|
||||
//! DMA Blit Manager
|
||||
class DmaBlitManager : public device::HostBlitManager {
|
||||
public:
|
||||
@@ -49,19 +52,10 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
virtual ~DmaBlitManager() {
|
||||
if (completion_signal_.handle != 0) {
|
||||
hsa_signal_destroy(completion_signal_);
|
||||
}
|
||||
}
|
||||
virtual ~DmaBlitManager() {}
|
||||
|
||||
//! Creates DmaBlitManager object
|
||||
virtual bool create(amd::Device& device) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &completion_signal_)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
virtual bool create(amd::Device& device) { return true; }
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object
|
||||
@@ -225,9 +219,6 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
size_t xferSize //!< Transfer size
|
||||
) const;
|
||||
|
||||
//! Handle of ROC Device object
|
||||
hsa_signal_t completion_signal_;
|
||||
|
||||
//! Assists in transferring data from Host to Local or vice versa
|
||||
//! taking into account the Hsail profile supported by Hsa Agent
|
||||
bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied
|
||||
|
||||
@@ -260,7 +260,12 @@ Device::~Device() {
|
||||
context().svmFree(coopHostcallBuffer_);
|
||||
coopHostcallBuffer_ = nullptr;
|
||||
}
|
||||
|
||||
if (0 != prefetch_signal_.handle) {
|
||||
hsa_signal_destroy(prefetch_signal_);
|
||||
}
|
||||
}
|
||||
|
||||
bool NullDevice::initCompiler(bool isOffline) {
|
||||
#if defined(WITH_COMPILER_LIB)
|
||||
// Initialize the compiler handle if it has not already been initialized
|
||||
|
||||
@@ -199,8 +199,9 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) {
|
||||
amd::Coord3D(size()), true)) {
|
||||
LogError("[OCL] Fail sync the device memory on cpuUnmap");
|
||||
}
|
||||
// Wait on CPU for the transfer
|
||||
static_cast<roc::VirtualGPU&>(vDev).releaseGpuMemoryFence();
|
||||
}
|
||||
|
||||
decIndMapCount();
|
||||
}
|
||||
|
||||
|
||||
@@ -456,34 +456,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
// Check for queue full and wait if needed.
|
||||
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
|
||||
hsa_signal_t signal;
|
||||
|
||||
// TODO: placeholder to setup the kernel to populate start and end timestamp.
|
||||
if (timestamp_ != nullptr) {
|
||||
// Pool size must grow to the size of pending AQL packets
|
||||
const uint32_t pool_size = index - read;
|
||||
if (pool_size >= signal_pool_.size()) {
|
||||
ProfilingSignal profiling_signal = {};
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
|
||||
LogPrintfError("Failed signal allocation id = %d", pool_size);
|
||||
return false;
|
||||
}
|
||||
signal_pool_.push_back(profiling_signal);
|
||||
assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
|
||||
}
|
||||
// Move index inside the valid pool
|
||||
++current_signal_ %= signal_pool_.size();
|
||||
// Find signal slot
|
||||
ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
|
||||
// Make sure we save the old results in the TS structure
|
||||
if (profilingSignal->ts_ != nullptr) {
|
||||
profilingSignal->ts_->checkGpuTime();
|
||||
}
|
||||
// Update the new TS with the signal info
|
||||
timestamp_->setProfilingSignal(profilingSignal);
|
||||
packet->completion_signal = profilingSignal->signal_;
|
||||
profilingSignal->ts_ = timestamp_;
|
||||
timestamp_->setAgent(gpu_device_);
|
||||
// Get active signal for current dispatch if profiling is necessary
|
||||
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
|
||||
}
|
||||
|
||||
// Make sure the slot is free for usage
|
||||
@@ -494,23 +472,11 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
// Add blocking command if the original value of read index was behind of the queue size
|
||||
if (blocking || (index - read) >= queueMask) {
|
||||
if (packet->completion_signal.handle == 0) {
|
||||
packet->completion_signal = barrier_signal_;
|
||||
packet->completion_signal = Barriers().ActiveSignal();
|
||||
}
|
||||
signal = packet->completion_signal;
|
||||
// Initialize signal for a wait
|
||||
hsa_signal_store_relaxed(signal, kInitSignalValueOne);
|
||||
blocking = true;
|
||||
}
|
||||
|
||||
// If runtime doesn't use the barrier, then make sure it tracks the last submitted command
|
||||
if (!dev().settings().barrier_sync_) {
|
||||
// Initialize signal for a wait
|
||||
assert(packet->completion_signal.handle != 0 &&
|
||||
"There is no HSA signal associated with the last command!");
|
||||
hsa_signal_store_relaxed(packet->completion_signal, kInitSignalValueOne);
|
||||
last_signal_ = packet->completion_signal;
|
||||
}
|
||||
|
||||
// Insert packet(s)
|
||||
// NOTE: need multiple packets to dispatch the performance counter
|
||||
// packet blob of the legacy devices (gfx8)
|
||||
@@ -550,12 +516,10 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
|
||||
// Wait on signal ?
|
||||
if (blocking) {
|
||||
if (!WaitForSignal(signal)) {
|
||||
LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
|
||||
if (!Barriers().WaitCurrent()) {
|
||||
LogPrintfError("Failed blocking queue wait with signal [0x%lx]", packet->completion_signal.handle);
|
||||
return false;
|
||||
}
|
||||
// Reset the pool of signals
|
||||
current_signal_ = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -601,6 +565,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
|
||||
assert(packet->completion_signal.handle != 0);
|
||||
const uint32_t queueSize = gpu_queue_->size;
|
||||
@@ -631,6 +597,7 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
|
||||
packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
|
||||
uint16_t packetHeader, hsa_signal_t signal) {
|
||||
const uint32_t queueSize = gpu_queue_->size;
|
||||
@@ -641,30 +608,8 @@ void VirtualGPU::dispatchGenericBarrierPacket(hsa_barrier_and_packet_t* packet,
|
||||
if (signal.handle == 0) {
|
||||
// Pool size must grow to the size of pending AQL packets
|
||||
const uint32_t pool_size = index - read;
|
||||
if (pool_size >= signal_pool_.size()) {
|
||||
ProfilingSignal profiling_signal = {};
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
|
||||
LogPrintfError("Failed signal allocation id = %d", pool_size);
|
||||
}
|
||||
signal_pool_.push_back(profiling_signal);
|
||||
assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
|
||||
}
|
||||
// Move index inside the valid pool
|
||||
++current_signal_ %= signal_pool_.size();
|
||||
// Find signal slot
|
||||
ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
|
||||
// Make sure we save the old results in the TS structure
|
||||
if (profilingSignal->ts_ != nullptr) {
|
||||
profilingSignal->ts_->checkGpuTime();
|
||||
}
|
||||
if (timestamp_ != nullptr) {
|
||||
// Update the new TS with the signal info
|
||||
timestamp_->setProfilingSignal(profilingSignal);
|
||||
profilingSignal->ts_ = timestamp_;
|
||||
timestamp_->setAgent(gpu_device_);
|
||||
}
|
||||
packet->completion_signal = profilingSignal->signal_;
|
||||
hsa_signal_store_relaxed(profilingSignal->signal_, kInitSignalValueOne);
|
||||
// Get active signal for current dispatch if profiling is necessary
|
||||
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
|
||||
} else {
|
||||
assert(signal.handle != 0);
|
||||
packet->completion_signal = signal;
|
||||
@@ -705,44 +650,25 @@ void VirtualGPU::ResetQueueStates() {
|
||||
// Release the pool, since runtime just completed a barrier
|
||||
// @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued
|
||||
resetKernArgPool();
|
||||
} else {
|
||||
// Reset the pool of signals
|
||||
current_signal_ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier) {
|
||||
// Return if there is no pending dispatch
|
||||
if (!hasPendingDispatch_) {
|
||||
if (dev().settings().barrier_sync_ || !force_barrier) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
hsa_signal_t wait_signal = barrier_signal_;
|
||||
bool VirtualGPU::releaseGpuMemoryFence(bool force_barrier, bool skip_cpu_wait) {
|
||||
if (hasPendingDispatch_ && (dev().settings().barrier_sync_ || force_barrier)) {
|
||||
barrier_packet_.completion_signal = Barriers().ActiveSignal();
|
||||
|
||||
// If barrier sync was requested or runtime didn't provide the last signal
|
||||
if (dev().settings().barrier_sync_ || force_barrier) {
|
||||
// Initialize signal for the barrier packet.
|
||||
hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
|
||||
|
||||
// Dispatch barrier packet into the queue and wait till it finishes.
|
||||
// Dispatch barrier packet into the queue
|
||||
dispatchBarrierPacket(&barrier_packet_);
|
||||
}
|
||||
else {
|
||||
// Take the signal of the last submitted dispatch
|
||||
wait_signal = last_signal_;
|
||||
hasPendingDispatch_ = false;
|
||||
}
|
||||
|
||||
// Wait for compute work previously submitted
|
||||
if (!WaitForSignal(wait_signal)) {
|
||||
LogError("Waiting for compute work failed!");
|
||||
return false;
|
||||
// Check if runtime could skip CPU wait
|
||||
if (!skip_cpu_wait) {
|
||||
Barriers().WaitCurrent();
|
||||
|
||||
ResetQueueStates();
|
||||
}
|
||||
|
||||
hasPendingDispatch_ = false;
|
||||
|
||||
ResetQueueStates();
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -800,7 +726,6 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
}
|
||||
|
||||
aqlHeader_ = dispatchPacketHeader_;
|
||||
barrier_signal_.handle = 0;
|
||||
|
||||
// Note: Virtual GPU device creation must be a thread safe operation
|
||||
roc_device_.vgpus_.resize(roc_device_.numOfVgpus_);
|
||||
@@ -808,16 +733,13 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
VirtualGPU::~VirtualGPU() {
|
||||
delete blitMgr_;
|
||||
|
||||
// Release the resources of signal
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
if (barrier_signal_.handle != 0) {
|
||||
hsa_signal_destroy(barrier_signal_);
|
||||
}
|
||||
|
||||
destroyPool();
|
||||
|
||||
releasePinnedMem();
|
||||
@@ -868,8 +790,7 @@ bool VirtualGPU::create() {
|
||||
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
|
||||
if (!gpu_queue_) return false;
|
||||
|
||||
if (!initPool(dev().settings().kernargPoolSize_,
|
||||
(profiling_ || (amd::IS_HIP)) ? queue_size : 0)) {
|
||||
if (!initPool(dev().settings().kernargPoolSize_)) {
|
||||
LogError("Couldn't allocate arguments/signals for the queue");
|
||||
return false;
|
||||
}
|
||||
@@ -881,17 +802,9 @@ bool VirtualGPU::create() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create signal for the barrier packet.
|
||||
hsa_signal_t signal = {0};
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne, 0, nullptr, &signal)) {
|
||||
return false;
|
||||
}
|
||||
barrier_signal_ = signal;
|
||||
|
||||
// Initialize barrier packet.
|
||||
memset(&barrier_packet_, 0, sizeof(barrier_packet_));
|
||||
barrier_packet_.header = kInvalidAql;
|
||||
barrier_packet_.completion_signal = barrier_signal_;
|
||||
|
||||
// Create a object of PrintfDbg
|
||||
printfdbg_ = new PrintfDbg(roc_device_);
|
||||
@@ -912,62 +825,32 @@ bool VirtualGPU::create() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allocate signal tracker for ROCr copy queue
|
||||
if (!Barriers().Create(gpu_device())) {
|
||||
LogError("Could not create signal for copy queue!");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) {
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::initPool(size_t kernarg_pool_size) {
|
||||
kernarg_pool_size_ = kernarg_pool_size;
|
||||
kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, false));
|
||||
if (kernarg_pool_base_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Optimization :
|
||||
// For better resource utilization runtime should create them only when required
|
||||
// In case of HIP, Apps create short live streams which do not need more signals
|
||||
// hence starting with smaller number 32. There is code inplace to grow the pool
|
||||
// later when it is needed.
|
||||
bool forced_default_pool_sz = false;
|
||||
if (!profiling_ && (amd::IS_HIP)) {
|
||||
forced_default_pool_sz = true;
|
||||
}
|
||||
|
||||
if (signal_pool_count != 0) {
|
||||
// Reserve signal pool for all entries in the queue, since profiling logic will save the
|
||||
// pointer in timestamp info for the future references
|
||||
signal_pool_.reserve(signal_pool_count);
|
||||
// If barrier is disable, then allocate a small portion of all signals and grow the array later.
|
||||
// @note: the optimization requires a wait for signal on reuse, which is only available when
|
||||
// the barrier is disabled
|
||||
constexpr uint32_t kDefaultSignalPoolSize = 32;
|
||||
const uint32_t default_signal_pool_size =
|
||||
(dev().settings().barrier_sync_ && !forced_default_pool_sz) ?
|
||||
signal_pool_count : kDefaultSignalPoolSize;
|
||||
signal_pool_.resize(default_signal_pool_size);
|
||||
for (uint i = 0; i < default_signal_pool_size; ++i) {
|
||||
ProfilingSignal profilingSignal;
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) {
|
||||
return false;
|
||||
}
|
||||
signal_pool_[i] = profilingSignal;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::destroyPool() {
|
||||
if (kernarg_pool_base_ != nullptr) {
|
||||
roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
|
||||
}
|
||||
|
||||
if (signal_pool_.size() > 0) {
|
||||
for (uint i = 0; i < signal_pool_.size(); ++i) {
|
||||
hsa_signal_destroy(signal_pool_[i].signal_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
|
||||
char* result = nullptr;
|
||||
do {
|
||||
@@ -982,24 +865,21 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
|
||||
//! We can issue a barrier to avoid expensive extra memory allocations.
|
||||
|
||||
// Initialize signal for the barrier packet.
|
||||
hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
|
||||
barrier_packet_.completion_signal = Barriers().ActiveSignal();
|
||||
|
||||
// Dispatch barrier packet into the queue and wait till it finishes.
|
||||
dispatchBarrierPacket(&barrier_packet_);
|
||||
if (!WaitForSignal(barrier_signal_)) {
|
||||
if (!Barriers().WaitCurrent()) {
|
||||
LogError("Kernel arguments reset failed");
|
||||
}
|
||||
|
||||
resetKernArgPool();
|
||||
|
||||
// Reset the pool of signals
|
||||
current_signal_ = 0;
|
||||
}
|
||||
} while (true);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
/* profilingBegin, when profiling is enabled, creates a timestamp to save in
|
||||
* virtualgpu's timestamp_, and calls start() to get the current host
|
||||
* timestamp.
|
||||
@@ -1007,9 +887,8 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
|
||||
void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
|
||||
if (command.profilingInfo().enabled_) {
|
||||
if (timestamp_ != nullptr) {
|
||||
LogWarning(
|
||||
"Trying to create a second timestamp in VirtualGPU. \
|
||||
This could have unintended consequences.");
|
||||
LogWarning("Trying to create a second timestamp in VirtualGPU. \
|
||||
This could have unintended consequences.");
|
||||
return;
|
||||
}
|
||||
// Without barrier profiling will wait for each individual signal
|
||||
@@ -1018,6 +897,7 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
/* profilingEnd, when profiling is enabled, checks to see if a signal was
|
||||
* created for whatever command we are running and calls end() to get the
|
||||
* current host timestamp if no signal is available. It then saves the pointer
|
||||
@@ -1033,6 +913,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::updateCommandsState(amd::Command* list) {
|
||||
Timestamp* ts = nullptr;
|
||||
|
||||
@@ -1335,8 +1216,10 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
|
||||
// ================================================================================================
|
||||
void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
|
||||
#if AMD_HMM_SUPPORT
|
||||
profilingBegin(cmd);
|
||||
// Initialize signal for the barrier
|
||||
hsa_signal_store_relaxed(barrier_signal_, kInitSignalValueOne);
|
||||
hsa_signal_t wait = Barriers().WaitSignal();
|
||||
hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
|
||||
|
||||
// Find the requested agent for the transfer
|
||||
hsa_agent_t agent = (cmd.cpu_access() ||
|
||||
@@ -1345,16 +1228,18 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
|
||||
|
||||
// Initiate a prefetch command
|
||||
hsa_status_t status = hsa_amd_svm_prefetch_async(
|
||||
const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 0, nullptr, barrier_signal_);
|
||||
const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent, 1, &wait, active);
|
||||
|
||||
// Wait for the prefetch
|
||||
if ((status != HSA_STATUS_SUCCESS) || !WaitForSignal(barrier_signal_)) {
|
||||
// Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution.
|
||||
if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) {
|
||||
LogError("hsa_amd_svm_prefetch_async failed");
|
||||
cmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
// Add system scope, since the prefetch scope is unclear
|
||||
addSystemScope();
|
||||
|
||||
profilingEnd(cmd);
|
||||
#endif // AMD_HMM_SUPPORT
|
||||
}
|
||||
|
||||
@@ -2490,9 +2375,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
}
|
||||
|
||||
if (gpuKernel.dynamicParallelism()) {
|
||||
barrier_packet_.completion_signal.handle = 0;
|
||||
dispatchBarrierPacket(&barrier_packet_);
|
||||
static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
|
||||
getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
|
||||
getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
|
||||
}
|
||||
|
||||
// Check if image buffer write back is required
|
||||
@@ -2594,9 +2480,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
|
||||
uint16_t header = kNopPacketHeader;
|
||||
hsa_signal_t sig { 0 };
|
||||
dispatchGenericBarrierPacket(&barrier_packet_, header, sig);
|
||||
last_signal_ = barrier_packet_.completion_signal;
|
||||
// Restore barrier signal
|
||||
barrier_packet_.completion_signal = barrier_signal_;
|
||||
}
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
@@ -2618,8 +2501,8 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
|
||||
profilingBegin(vcmd);
|
||||
if (!dev().settings().barrier_sync_) {
|
||||
// Force barrier to make sure L2 flush, since interop can be in sysmem
|
||||
constexpr bool ForceBarrier = true;
|
||||
releaseGpuMemoryFence(ForceBarrier);
|
||||
constexpr bool kForceBarrier = true;
|
||||
releaseGpuMemoryFence(kForceBarrier);
|
||||
}
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
@@ -2644,6 +2527,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::addXferWrite(Memory& memory) {
|
||||
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
|
||||
//! unconditionally, before it can release pinned memory
|
||||
releaseGpuMemoryFence();
|
||||
if (xferWriteBuffers_.size() > 7) {
|
||||
dev().xferWrite().release(*this, *xferWriteBuffers_.front());
|
||||
xferWriteBuffers_.erase(xferWriteBuffers_.begin());
|
||||
@@ -2653,6 +2539,7 @@ void VirtualGPU::addXferWrite(Memory& memory) {
|
||||
xferWriteBuffers_.push_back(&memory);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::releaseXferWrite() {
|
||||
for (auto& memory : xferWriteBuffers_) {
|
||||
dev().xferWrite().release(*this, *memory);
|
||||
@@ -2660,7 +2547,11 @@ void VirtualGPU::releaseXferWrite() {
|
||||
xferWriteBuffers_.resize(0);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
|
||||
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
|
||||
//! unconditionally, before it can release pinned memory
|
||||
releaseGpuMemoryFence();
|
||||
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
|
||||
if (pinnedMems_.size() > 7) {
|
||||
pinnedMems_.front()->release();
|
||||
@@ -2672,6 +2563,7 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::releasePinnedMem() {
|
||||
for (auto& amdMemory : pinnedMems_) {
|
||||
amdMemory->release();
|
||||
@@ -2679,6 +2571,7 @@ void VirtualGPU::releasePinnedMem() {
|
||||
pinnedMems_.resize(0);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
|
||||
for (auto& amdMemory : pinnedMems_) {
|
||||
if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
|
||||
|
||||
@@ -36,10 +36,11 @@ class Memory;
|
||||
class Timestamp;
|
||||
|
||||
struct ProfilingSignal : public amd::HeapObject {
|
||||
hsa_signal_t signal_; //!< HSA signal to track profiling information
|
||||
Timestamp* ts_; //!< Timestamp object associated with the signal
|
||||
hsa_signal_t signal_; //!< HSA signal to track profiling information
|
||||
Timestamp* ts_; //!< Timestamp object associated with the signal
|
||||
bool done_; //!< True if signal is done
|
||||
|
||||
ProfilingSignal() : ts_(nullptr) { signal_.handle = 0; }
|
||||
ProfilingSignal() : ts_(nullptr), done_(true) { signal_.handle = 0; }
|
||||
};
|
||||
|
||||
// Initial HSA signal value
|
||||
@@ -111,13 +112,19 @@ class Timestamp {
|
||||
hsa_amd_profiling_dispatch_time_t time;
|
||||
|
||||
if (splittedDispatch_) {
|
||||
uint64_t start = UINT64_MAX;
|
||||
uint64_t start = std::numeric_limits<uint64_t>::max();
|
||||
uint64_t end = 0;
|
||||
for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) {
|
||||
if (hsa_signal_load_relaxed(profilingSignal_->signal_) > 0) {
|
||||
WaitForSignal(*it);
|
||||
}
|
||||
hsa_amd_profiling_get_dispatch_time(agent_, *it, &time);
|
||||
if ((time.end - time.start) == 0) {
|
||||
hsa_amd_profiling_async_copy_time_t time_sdma = {};
|
||||
hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
|
||||
time.start = time_sdma.start;
|
||||
time.end = time_sdma.end;
|
||||
}
|
||||
if (time.start < start) {
|
||||
start = time.start;
|
||||
}
|
||||
@@ -133,10 +140,18 @@ class Timestamp {
|
||||
WaitForSignal(profilingSignal_->signal_);
|
||||
}
|
||||
hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
|
||||
start_ = time.start * ticksToTime_;
|
||||
end_ = time.end * ticksToTime_;
|
||||
if ((time.end - time.start) == 0) {
|
||||
hsa_amd_profiling_async_copy_time_t time_sdma = {};
|
||||
hsa_amd_profiling_get_async_copy_time(profilingSignal_->signal_, &time_sdma);
|
||||
start_ = time_sdma.start * ticksToTime_;
|
||||
end_ = time_sdma.end * ticksToTime_;
|
||||
} else {
|
||||
start_ = time.start * ticksToTime_;
|
||||
end_ = time.end * ticksToTime_;
|
||||
}
|
||||
}
|
||||
profilingSignal_->ts_ = nullptr;
|
||||
profilingSignal_->done_ = true;
|
||||
profilingSignal_ = nullptr;
|
||||
}
|
||||
}
|
||||
@@ -192,6 +207,109 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
|
||||
};
|
||||
|
||||
class HwQueueTracker : public amd::EmbeddedObject {
|
||||
public:
|
||||
HwQueueTracker() {}
|
||||
|
||||
~HwQueueTracker() {
|
||||
for (auto& signal: signal_list_) {
|
||||
if (signal->signal_.handle != 0) {
|
||||
hsa_signal_destroy(signal->signal_);
|
||||
}
|
||||
delete signal;
|
||||
}
|
||||
}
|
||||
|
||||
//! Creates a pool of signals for tracking of HW operations on the queue
|
||||
bool Create(hsa_agent_t agent) {
|
||||
constexpr size_t kSignalListSize = 16;
|
||||
signal_list_.resize(kSignalListSize);
|
||||
for (uint i = 0; i < kSignalListSize; ++i) {
|
||||
ProfilingSignal* signal = new ProfilingSignal();
|
||||
if ((signal == nullptr) || (HSA_STATUS_SUCCESS != hsa_signal_create(
|
||||
0, 1, &agent, &signal->signal_))) {
|
||||
return false;
|
||||
}
|
||||
signal_list_[i] = signal;
|
||||
}
|
||||
agent_ = agent;
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Finds a free signal for the upcomming operation
|
||||
hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
|
||||
Timestamp* ts = nullptr, uint32_t queue_size = 0) {
|
||||
// If queue size grows, then add more signals to avoid more frequent stalls
|
||||
if (queue_size > signal_list_.size()) {
|
||||
ProfilingSignal* signal = new ProfilingSignal();
|
||||
if (signal != nullptr) {
|
||||
if (HSA_STATUS_SUCCESS == hsa_signal_create(
|
||||
0, 1, &agent_, &signal->signal_)) {
|
||||
signal_list_.push_back(signal);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Find valid index
|
||||
++current_id_ %= signal_list_.size();
|
||||
|
||||
// Make sure the previous operation on the current signal is done
|
||||
WaitCurrent();
|
||||
|
||||
// Have to wait the next signal in the queue to avoid a race condition between
|
||||
// a GPU waiter(which may be not triggered yet) and CPU signal reset below
|
||||
WaitNext();
|
||||
|
||||
// Reset the signal and return
|
||||
hsa_signal_silent_store_relaxed(signal_list_[current_id_]->signal_, init_val);
|
||||
signal_list_[current_id_]->done_ = false;
|
||||
if (ts != 0) {
|
||||
if (!sdma_profiling_) {
|
||||
hsa_amd_profiling_async_copy_enable(true);
|
||||
sdma_profiling_ = true;
|
||||
}
|
||||
signal_list_[current_id_]->ts_ = ts;
|
||||
ts->setProfilingSignal(signal_list_[current_id_]);
|
||||
ts->setAgent(agent_);
|
||||
}
|
||||
return signal_list_[current_id_]->signal_;
|
||||
}
|
||||
|
||||
//! Wait for the curent active signal. Can idle the queue
|
||||
bool WaitCurrent() { return WaitIndex(current_id_); }
|
||||
|
||||
//! Returns the last submitted signal for a wait
|
||||
hsa_signal_t WaitSignal() const { return signal_list_[current_id_]->signal_; }
|
||||
|
||||
private:
|
||||
//! Wait for the next active signal
|
||||
void WaitNext() {
|
||||
size_t next = (current_id_ + 1) % signal_list_.size();
|
||||
WaitIndex(next);
|
||||
}
|
||||
|
||||
//! Wait for the provided signal
|
||||
bool WaitIndex(size_t index) {
|
||||
// Wait for the current signal
|
||||
if (!signal_list_[index]->done_) {
|
||||
// Update timestamp values if requested
|
||||
if (signal_list_[index]->ts_ != nullptr) {
|
||||
signal_list_[index]->ts_->checkGpuTime();
|
||||
} else {
|
||||
if (!WaitForSignal(signal_list_[index]->signal_)) {
|
||||
LogPrintfError("Failed signal [0x%lx] wait", signal_list_[index]->signal_);
|
||||
return false;
|
||||
}
|
||||
signal_list_[index]->done_ = true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
std::vector<ProfilingSignal*> signal_list_; //!< The pool of all signals for processing
|
||||
size_t current_id_ = 0; //!< Last submitted signal
|
||||
hsa_agent_t agent_; //!< HSA device agent
|
||||
bool sdma_profiling_ = false; //!< Don't enable SDMA profiling by default
|
||||
};
|
||||
|
||||
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
|
||||
const std::vector<uint32_t>& cuMask = {},
|
||||
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
|
||||
@@ -256,7 +374,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
*
|
||||
* @return bool true if Wait returned successfully, false otherwise
|
||||
*/
|
||||
bool releaseGpuMemoryFence(bool force_barrier = false);
|
||||
bool releaseGpuMemoryFence(bool force_barrier = false, bool skip_copy_wait = false);
|
||||
|
||||
hsa_agent_t gpu_device() { return gpu_device_; }
|
||||
hsa_queue_t* gpu_queue() { return gpu_queue_; }
|
||||
@@ -297,6 +415,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void addSystemScope() { addSystemScope_ = true; }
|
||||
void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }
|
||||
|
||||
HwQueueTracker& Barriers() { return barriers_; }
|
||||
|
||||
Timestamp* timestamp() const { return timestamp_; }
|
||||
|
||||
// } roc OpenCL integration
|
||||
private:
|
||||
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
|
||||
@@ -316,7 +438,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet,
|
||||
amd::NDRangeContainer& sizes);
|
||||
|
||||
bool initPool(size_t kernarg_pool_size, uint signal_pool_count);
|
||||
bool initPool(size_t kernarg_pool_size);
|
||||
void destroyPool();
|
||||
|
||||
void* allocKernArg(size_t size, size_t alignment);
|
||||
@@ -368,7 +490,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint32_t cooperative_ : 1; //!< Cooperative launch is enabled
|
||||
uint32_t addSystemScope_ : 1; //!< Insert a system scope to the next aql
|
||||
uint32_t isLastCommandSDMA_ : 1; //!< Keep track if the last command was SDMA and
|
||||
//!< not send Barrier packets if barrier_sync is 0
|
||||
//!< not send Barrier packets if barrier_sync is 0
|
||||
};
|
||||
uint32_t state_;
|
||||
};
|
||||
@@ -379,8 +501,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
hsa_agent_t gpu_device_; //!< Physical device
|
||||
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
|
||||
hsa_barrier_and_packet_t barrier_packet_;
|
||||
hsa_signal_t barrier_signal_;
|
||||
hsa_signal_t last_signal_ = {}; //!< Last submitted signal
|
||||
|
||||
uint32_t dispatch_id_; //!< This variable must be updated atomically.
|
||||
Device& roc_device_; //!< roc device object
|
||||
PrintfDbg* printfdbg_;
|
||||
@@ -396,12 +517,12 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
hsa_queue_t* schedulerQueue_;
|
||||
hsa_signal_t schedulerSignal_;
|
||||
|
||||
HwQueueTracker barriers_; //!< Tracks active barriers in ROCr
|
||||
|
||||
char* kernarg_pool_base_;
|
||||
size_t kernarg_pool_size_;
|
||||
uint kernarg_pool_cur_offset_;
|
||||
|
||||
std::vector<ProfilingSignal> signal_pool_; //!< Pool of signals for profiling
|
||||
uint32_t current_signal_ = 0; //!< Current avaialble signal in the pool
|
||||
friend class Timestamp;
|
||||
|
||||
// PM4 packet for gfx8 performance counter
|
||||
|
||||
Reference in New Issue
Block a user