SWDEV-497145 - Use rocr copyOnEngine API for staged copies

- Refactor blit code and clean ASAN instrumentation
- Use unified function for rocr copy
- Enable shader copy path for unpinned writeBuffer/readBuffer paths
- Set GPU_FORCE_BLIT_COPY_SIZE=16, which means BLIT copy is used for
  pinned copies or unpinned H2D/D2H copies smaller than 16 KB

Change-Id: I42045cca79234b340dbf53dafb93044199736ae4


[ROCm/clr commit: 7863eb92dc]
Этот коммит содержится в:
Saleel Kudchadker
2024-11-08 05:51:17 +00:00
родитель 6933aa7c29
Коммит 7d7aa8b69c
7 изменённых файлов: 359 добавлений и 427 удалений
+292 -364
Просмотреть файл
@@ -48,21 +48,6 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const {
return static_cast<Memory&>(mem);
}
// ================================================================================================
//! Reads device memory into a host pointer by delegating to hsaCopyStaged().
//!
//! The device source address is the base address of \p srcMemory advanced by
//! \p origin + \p offset; the host destination is \p dstHost advanced by \p offset.
//! The staging address is taken from \p xferBuf. \p totalSize bytes are requested
//! in a single hsaCopyStaged() call. \p xferSize is not referenced by this body.
//!
//! \return the result of hsaCopyStaged()
bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory& xferBuf,
                                      size_t origin, size_t& offset, size_t& totalSize,
                                      size_t xferSize) const {
  // Direction flag handed to hsaCopyStaged(): this is a device-to-host transfer
  constexpr bool kHostToDevice = false;

  // Device-side source, positioned at the requested region
  const_address deviceSrc = srcMemory.getDeviceMemory();
  address stagingAddr = xferBuf.getDeviceMemory();
  deviceSrc += (origin + offset);

  // Host-side destination, advanced by the same running offset
  address hostDst = reinterpret_cast<address>(dstHost) + offset;

  return hsaCopyStaged(deviceSrc, hostDst, totalSize, stagingAddr, kHostToDevice);
}
// ================================================================================================
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
const amd::Coord3D& origin, const amd::Coord3D& size,
@@ -77,77 +62,15 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
gpu().Barriers().WaitCurrent();
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
} else {
size_t srcSize = size[0];
size_t offset = 0;
size_t pinSize = dev().settings().pinnedXferSize_;
pinSize = std::min(pinSize, srcSize);
// Check if a pinned transfer can be executed
if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
// Align offset to 4K boundary
char* tmpHost = const_cast<char*>(
amd::alignDown(reinterpret_cast<const char*>(dstHost), PinnedMemoryAlignment));
// Find the partial size for unaligned copy
size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;
amd::Memory* pinned = nullptr;
bool first = true;
size_t tmpSize;
size_t pinAllocSize;
// Copy memory, using pinning
while (srcSize > 0) {
// If it's the first iterarion, then readjust the copy size
// to include alignment
if (first) {
pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
tmpSize = std::min(pinAllocSize - partial, srcSize);
first = false;
} else {
tmpSize = std::min(pinSize, srcSize);
pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
partial = 0;
}
amd::Coord3D dst(partial, 0, 0);
amd::Coord3D srcPin(origin[0] + offset, 0, 0);
amd::Coord3D copySizePin(tmpSize, 0, 0);
size_t partial2;
// Allocate a GPU resource for pinning
pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
if (pinned != nullptr) {
// Get device memory for this virtual device
Memory* dstMemory = dev().getRocMemory(pinned);
const KernelBlitManager *kb = dynamic_cast<const KernelBlitManager*>(this);
if (!kb->copyBuffer(gpuMem(srcMemory), *dstMemory, srcPin, dst,
copySizePin)) {
LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
gpu().addPinnedMem(pinned);
break;
}
gpu().addPinnedMem(pinned);
} else {
LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
break;
}
srcSize -= tmpSize;
offset += tmpSize;
tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
}
}
if (0 != srcSize) {
Memory& xferBuf = dev().xferRead().acquire();
// Read memory using a staging resource
if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], offset, srcSize,
srcSize)) {
LogError("DmaBlitManager::readBuffer failed!");
size_t copySize = size[0];
if (0 != copySize) {
const_address addrSrc = gpuMem(srcMemory).getDeviceMemory() + origin[0];
address addrDst = reinterpret_cast<address>(dstHost);
constexpr bool kHostToDev = false;
if(!hsaCopyStaged(addrSrc, addrDst, copySize, kHostToDev, copyMetadata)) {
LogError("DmaBlitManager::readBuffer staged copy failed!");
return false;
}
dev().xferRead().release(gpu(), xferBuf);
}
}
@@ -170,8 +93,6 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
gpu().Barriers().WaitCurrent();
return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire, copyMetadata);
} else {
Memory& xferBuf = dev().xferRead().acquire();
address staging = xferBuf.getDeviceMemory();
const_address src = gpuMem(srcMemory).getDeviceMemory();
size_t srcOffset;
@@ -184,13 +105,12 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
// Copy data from device to host - line by line
address dst = reinterpret_cast<address>(dstHost) + dstOffset;
bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], staging, false);
bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], false, copyMetadata);
if (!retval) {
return retval;
}
}
}
dev().xferRead().release(gpu(), xferBuf);
}
return true;
@@ -216,20 +136,6 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
return true;
}
// ================================================================================================
bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, address staging,
size_t origin, size_t& offset, size_t& totalSize,
size_t xferSize) const {
address dst = dstMemory.getDeviceMemory();
// Copy data from host to device
dst += origin + offset;
const_address src = reinterpret_cast<const_address>(srcHost) + offset;
bool retval = hsaCopyStaged(src, dst, totalSize, staging, true);
return retval;
}
// ================================================================================================
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
@@ -241,79 +147,17 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
gpu().releaseGpuMemoryFence();
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
} else {
// HSA copy functionality with a possible async operation
gpu().releaseGpuMemoryFence(kSkipCpuWait);
size_t dstSize = size[0];
size_t tmpSize = 0;
size_t offset = 0;
size_t pinSize = dev().settings().pinnedXferSize_;
pinSize = std::min(pinSize, dstSize);
// Check if a pinned transfer can be executed
if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
// Align offset to 4K boundary
char* tmpHost = const_cast<char*>(
amd::alignDown(reinterpret_cast<const char*>(srcHost), PinnedMemoryAlignment));
// Find the partial size for unaligned copy
size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
amd::Memory* pinned = nullptr;
bool first = true;
size_t tmpSize;
size_t pinAllocSize;
// Copy memory, using pinning
while (dstSize > 0) {
// If it's the first iterarion, then readjust the copy size
// to include alignment
if (first) {
pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
tmpSize = std::min(pinAllocSize - partial, dstSize);
first = false;
} else {
tmpSize = std::min(pinSize, dstSize);
pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
partial = 0;
}
amd::Coord3D src(partial, 0, 0);
amd::Coord3D dstPin(origin[0] + offset, 0, 0);
amd::Coord3D copySizePin(tmpSize, 0, 0);
size_t partial2;
// Allocate a GPU resource for pinning
pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
if (pinned != nullptr) {
// Get device memory for this virtual device
Memory* srcMemory = dev().getRocMemory(pinned);
const KernelBlitManager *kb = dynamic_cast<const KernelBlitManager*>(this);
if (!kb->copyBuffer(*srcMemory, gpuMem(dstMemory), src, dstPin,
copySizePin)) {
LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
gpu().addPinnedMem(pinned);
break;
}
gpu().addPinnedMem(pinned);
} else {
LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
break;
}
dstSize -= tmpSize;
offset += tmpSize;
tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
}
}
if (dstSize != 0) {
address staging = gpu().Staging().Acquire(
std::min(dstSize, dev().settings().stagedXferSize_));
size_t copySize = size[0];
// For small copies use managed staging buffers which can be non blocking
if (copySize != 0) {
address dstAddr = gpuMem(dstMemory).getDeviceMemory() + origin[0];
const_address srcAddr = reinterpret_cast<const_address>(srcHost);
// Write memory using a staging resource
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), staging, origin[0], offset, dstSize,
dstSize)) {
LogError("DmaBlitManager::writeBuffer failed!");
constexpr bool kHostToDev = true;
bool result = hsaCopyStaged(srcAddr, dstAddr, copySize, kHostToDev, copyMetadata);
if (!result) {
LogError("DmaBlitManager::writeBuffer staging copy failed!");
return false;
}
}
@@ -336,8 +180,6 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire,
copyMetadata);
} else {
address staging = gpu().Staging().Acquire(
std::min(size[0], dev().settings().stagedXferSize_));
address dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
size_t srcOffset;
@@ -350,7 +192,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
// Copy data from host to device - line by line
const_address src = reinterpret_cast<const_address>(srcHost) + srcOffset;
bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], staging, true);
constexpr bool kHostToDev = true;
bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], kHostToDev, copyMetadata);
if (!retval) {
return retval;
}
@@ -632,43 +475,10 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
}
// ================================================================================================
bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, amd::CopyMetadata copyMetadata) const {
address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
gpu().releaseGpuMemoryFence(kSkipCpuWait);
src += srcOrigin[0];
dst += dstOrigin[0];
// Just call copy function for full profile
inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
const_address src, hsa_agent_t& srcAgent, size_t size,
amd::CopyMetadata& copyMetadata) const {
hsa_status_t status = HSA_STATUS_SUCCESS;
if (dev().agent_profile() == HSA_PROFILE_FULL) {
// Stall GPU, since CPU copy is possible
gpu().Barriers().WaitCurrent();
status = hsa_memory_copy(dst, src, size[0]);
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("Hsa copy of data failed with code %d", status);
}
return (status == HSA_STATUS_SUCCESS);
}
hsa_agent_t srcAgent;
hsa_agent_t dstAgent;
if (&srcMemory.dev() == &dstMemory.dev()) {
// Detect the agents for memory allocations
srcAgent =
(srcMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
dstAgent =
(dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
}
else {
srcAgent = srcMemory.dev().getBackendDevice();
dstAgent = dstMemory.dev().getBackendDevice();
}
uint32_t copyMask = 0;
uint32_t freeEngineMask = 0;
@@ -707,9 +517,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
}
// Check if host wait has to be forced
bool forceHostWait = forceHostWaitFunc(size[0]);
bool forceHostWait = forceHostWaitFunc(size);
auto wait_events = gpu().Barriers().WaitingSignal(engine);
constexpr bool kIgnoreHostWait = false;
// Ignore waiting on any previous kernel dispatch and queue a signal to ROCr copy api instead
auto wait_events = gpu().Barriers().WaitingSignal(engine, kIgnoreHostWait);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp(),
forceHostWait);
@@ -740,11 +552,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"HSA Async Copy on copy_engine=0x%x, dst=0x%zx, src=0x%zx, "
"size=%ld, forceSDMA=%d, wait_event=0x%zx, completion_signal=0x%zx", copyEngine,
dst, src, size[0], forceSDMA, (wait_events.size() != 0) ? wait_events[0].handle : 0,
dst, src, size, forceSDMA, (wait_events.size() != 0) ? wait_events[0].handle : 0,
active.handle);
status = hsa_amd_memory_async_copy_on_engine(dst, dstAgent, src, srcAgent,
size[0], wait_events.size(),
size, wait_events.size(),
wait_events.data(), active, copyEngine,
forceSDMA);
} else {
@@ -756,11 +568,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, "
"completion_signal=0x%zx",
dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0,
dst, src, size, (wait_events.size() != 0) ? wait_events[0].handle : 0,
active.handle);
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
size[0], wait_events.size(), wait_events.data(), active);
size, wait_events.size(), wait_events.data(), active);
}
if (status == HSA_STATUS_SUCCESS) {
@@ -773,93 +585,104 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
return (status == HSA_STATUS_SUCCESS);
}
// ================================================================================================
//! Copies a linear region between two memory objects through rocrCopyBuffer().
//!
//! Only the first component of \p srcOrigin, \p dstOrigin and \p size is used.
//! The GPU memory fence is released with kSkipCpuWait before the copy is issued.
//! When both memory objects belong to the same device, each side uses the CPU agent
//! if it reports isHostMemDirectAccess(), otherwise the backend device agent.
//! When the devices differ, each side uses the backend agent of its own device.
//!
//! \return the result of rocrCopyBuffer()
bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
                             const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                             const amd::Coord3D& size, amd::CopyMetadata& copyMetadata) const {
  address srcAddr = reinterpret_cast<address>(srcMemory.getDeviceMemory());
  address dstAddr = reinterpret_cast<address>(dstMemory.getDeviceMemory());

  gpu().releaseGpuMemoryFence(kSkipCpuWait);

  // Advance both addresses to the requested linear offsets
  srcAddr += srcOrigin[0];
  dstAddr += dstOrigin[0];

  hsa_agent_t srcAgent;
  hsa_agent_t dstAgent;
  const bool crossDevice = (&srcMemory.dev() != &dstMemory.dev());
  if (crossDevice) {
    // Each side is addressed through the backend agent of its owning device
    srcAgent = srcMemory.dev().getBackendDevice();
    dstAgent = dstMemory.dev().getBackendDevice();
  } else {
    // Same device: pick the agent per allocation, based on direct host access
    if (srcMemory.isHostMemDirectAccess()) {
      srcAgent = dev().getCpuAgent();
    } else {
      srcAgent = dev().getBackendDevice();
    }
    if (dstMemory.isHostMemDirectAccess()) {
      dstAgent = dev().getCpuAgent();
    } else {
      dstAgent = dev().getBackendDevice();
    }
  }

  return rocrCopyBuffer(dstAddr, dstAgent, srcAddr, srcAgent, size[0], copyMetadata);
}
// ================================================================================================
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
address staging, bool hostToDev) const {
bool hostToDev, amd::CopyMetadata& copyMetadata) const {
// Stall GPU, since CPU copy is possible
gpu().releaseGpuMemoryFence(hostToDev);
// No allocation is necessary for Full Profile
hsa_status_t status;
if (dev().agent_profile() == HSA_PROFILE_FULL) {
status = hsa_memory_copy(hostDst, hostSrc, size);
if (status != HSA_STATUS_SUCCESS) {
LogPrintfError("Hsa copy of data failed with code %d", status);
}
return (status == HSA_STATUS_SUCCESS);
}
size_t totalSize = size;
size_t offset = 0;
size_t stagedCopyOffset = 0;
bool status = true;
Memory* xferBuf = nullptr;
address stagingBuffer = 0;
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
address hsaBuffer = staging;
if (!hostToDev) {
// Get static staging buffer as we need to wait until copy on GPU completes to copy
// it back to the unpinned buffer
xferBuf = &dev().xferRead().acquire();
stagingBuffer = xferBuf->getDeviceMemory();
}
// Allocate requested size of memory
while (totalSize > 0) {
size = std::min(totalSize, dev().settings().stagedXferSize_);
size = std::min(totalSize, maxStagedXferSize);
hsa_agent_t srcAgent;
hsa_agent_t dstAgent;
// Copy data from Host to Device
if (hostToDev) {
const hsa_agent_t srcAgent = dev().getCpuAgent();
hsa_agent_t srcAgent = dev().getCpuAgent();
hsa_agent_t dstAgent = dev().getBackendDevice();
HwQueueEngine engine = HwQueueEngine::Unknown;
if (srcAgent.handle == dev().getBackendDevice().handle) {
engine = HwQueueEngine::SdmaWrite;
// Get an address from managed staging buffer
stagingBuffer = gpu().Staging().Acquire(std::min(size, maxStagedXferSize));
address dst = hostDst + stagedCopyOffset;
memcpy(stagingBuffer, hostSrc + stagedCopyOffset, size);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged H2D");
status = rocrCopyBuffer(dst, dstAgent, stagingBuffer, srcAgent, size, copyMetadata);
if (!status) {
break;
}
gpu().Barriers().SetActiveEngine(engine);
auto wait_events = gpu().Barriers().WaitingSignal(engine);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
memcpy(hsaBuffer, hostSrc + offset, size);
status = hsa_amd_memory_async_copy(
hostDst + offset, dev().getBackendDevice(), hsaBuffer, srcAgent, size,
wait_events.size(), wait_events.data(), active);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"HSA Async Copy staged H2D dst=0x%zx, src=0x%zx, size=%ld, completion_signal=0x%zx",
hostDst + offset, hsaBuffer, size, active.handle);
if (status != HSA_STATUS_SUCCESS) {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("Hsa copy from host to device failed with code %d", status);
return false;
}
totalSize -= size;
if (totalSize > 0) {
// Wait if there are extra copies, which don't fit in a single staging buffer
gpu().Barriers().WaitCurrent();
}
offset += size;
continue;
}
const hsa_agent_t dstAgent = dev().getCpuAgent();
HwQueueEngine engine = HwQueueEngine::Unknown;
if (dstAgent.handle == dev().getBackendDevice().handle) {
engine = HwQueueEngine::SdmaRead;
}
gpu().Barriers().SetActiveEngine(engine);
auto wait_events = gpu().Barriers().WaitingSignal(engine);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
// Copy data from Device to Host
status = hsa_amd_memory_async_copy(
hsaBuffer, dstAgent, hostSrc + offset, dev().getBackendDevice(), size,
wait_events.size(), wait_events.data(), active);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"HSA Async Copy staged D2H dst=0x%zx, src=0x%zx, size=%ld, completion_signal=0x%zx",
hsaBuffer, hostSrc + offset, size, active.handle);
if (status == HSA_STATUS_SUCCESS) {
gpu().Barriers().WaitCurrent();
memcpy(hostDst + offset, hsaBuffer, size);
} else {
gpu().Barriers().ResetCurrentSignal();
LogPrintfError("Hsa copy from device to host failed with code %d", status);
return false;
dstAgent = dev().getCpuAgent();
srcAgent = dev().getBackendDevice();
const_address src = static_cast<const_address>(hostSrc) + stagedCopyOffset;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged D2H");
status = rocrCopyBuffer(stagingBuffer, dstAgent, src, srcAgent, size, copyMetadata);
if (status) {
gpu().Barriers().WaitCurrent();
memcpy(hostDst + stagedCopyOffset, stagingBuffer, size);
} else {
break;
}
}
totalSize -= size;
offset += size;
stagedCopyOffset += size;
}
if (!hostToDev) {
dev().xferRead().release(gpu(), *xferBuf);
}
if (!status) {
return false;
}
gpu().addSystemScope();
@@ -1829,13 +1652,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
synchronize();
return result;
} else {
size_t pinSize = size[0];
size_t totalSize = size[0];
// Check if a pinned transfer can be executed with a single pin
if (((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer))) {
if (((totalSize <= dev().settings().pinnedXferSize_) &&
(totalSize > MinSizeForPinnedTransfer))) {
size_t partial;
amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
amd::Memory* amdMemory = pinHostMemory(dstHost, totalSize, partial);
if (amdMemory == nullptr) {
// Force SW copy
@@ -1857,7 +1680,55 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
// Add pinned memory for a later release
gpu().addPinnedMem(amdMemory);
} else {
result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
// Do a staging copy
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::BLIT);
if (!useShaderCopyPath) {
// HSA copy using a staging resource
result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size,
entire, copyMetadata);
}
if (!result) {
// Blit copy using a staging resource
address srcAddr = gpuMem(srcMemory).getDeviceMemory();
address dstAddr = reinterpret_cast<address>(dstHost);
amd::Coord3D dstOrigin(0, 0, 0);
size_t copySize = 0;
size_t stagedCopyOffset = 0;
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
Memory& xferBuf = dev().xferRead().acquire();
address xferBufAddr = xferBuf.getDeviceMemory();
constexpr bool kAttachSignal = true;
while (totalSize > 0) {
copySize = std::min(totalSize, maxStagedXferSize);
srcAddr += stagedCopyOffset;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, "
"dstOrigin=%zu, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize);
// Flush caches for coherency after the copy as we need to std::memcpy
// from staging buffer to unpinned dst. Also attach a signal to the dispatch packet
// itself that we can wait on without extra barrier packet.
gpu().addSystemScope();
result = shaderCopyBuffer(xferBufAddr, srcAddr, dstOrigin, origin, copySize,
entire, dev().settings().limit_blit_wg_, copyMetadata,
kAttachSignal);
if (!result) {
break;
}
// Wait on current signal of previous blit copy
gpu().Barriers().WaitCurrent();
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
(void*)(dstAddr + stagedCopyOffset), xferBufAddr, copySize);
memcpy(dstAddr + stagedCopyOffset, xferBufAddr, copySize);
totalSize -= copySize;
stagedCopyOffset += copySize;
}
dev().xferRead().release(gpu(), xferBuf);
}
}
}
@@ -1934,16 +1805,20 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
synchronize();
return result;
} else {
size_t pinSize = size[0];
size_t totalSize = size[0];
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Unpinned write path");
// If size > min pinned size, do a pinning copy, since we are limited by staging buffer size
// Check if a pinned transfer can be executed with a single pin
if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) {
if ((totalSize <= dev().settings().pinnedXferSize_) &&
(totalSize > MinSizeForPinnedTransfer)) {
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Pinned write copy for size=%ld", totalSize);
size_t partial;
amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
amd::Memory* amdMemory = pinHostMemory(srcHost, totalSize, partial);
if (amdMemory == nullptr) {
// Force SW copy
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
size, entire, copyMetadata);
synchronize();
return result;
}
@@ -1960,7 +1835,47 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
// Add pinned memory for a later release
gpu().addPinnedMem(amdMemory);
} else {
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
// Do a staging copy
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::BLIT);
if (!useShaderCopyPath) {
// HSA copy using a staging resource
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
size, entire, copyMetadata);
}
if (!result) {
// Blit copy using a staging resource
address dstAddr = gpuMem(dstMemory).getDeviceMemory();
const_address srcAddr = reinterpret_cast<const_address>(srcHost);
amd::Coord3D srcOrigin(0, 0, 0);
size_t copySize = 0;
size_t stagedCopyOffset = 0;
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
while (totalSize > 0) {
copySize = std::min(totalSize, maxStagedXferSize);
// Get an address from managed staging buffer
address stagingBuffer = gpu().Staging().Acquire(std::min(copySize, maxStagedXferSize));
dstAddr += stagedCopyOffset;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
"dstOrigin=%zu, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
result = shaderCopyBuffer(dstAddr, stagingBuffer,
origin, srcOrigin, copySize,
entire, dev().settings().limit_blit_wg_, copyMetadata);
if (!result) {
break;
}
totalSize -= copySize;
stagedCopyOffset += copySize;
}
}
}
}
@@ -2230,6 +2145,68 @@ bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern
ShouldNotReachHere();
return false;
}
// ================================================================================================
//! Dispatches the BlitCopyBuffer kernel to copy sizeIn[0] bytes from \p src to \p dst.
//!
//! Only the first component of \p dstOrigin, \p srcOrigin and \p sizeIn is used.
//! When both origins are multiples of 16 bytes the copy is programmed with a
//! 16-byte element size and a local work size of 512; otherwise a 4-byte element
//! size and a local work size of 1024 are used. The global work size is capped at
//! \p blitWg work-groups and rounded up to a multiple of the local work size.
//! \p attachSignal is forwarded to submitKernelInternal(). \p entire and
//! \p copyMetadata are not referenced by this body.
//!
//! \return the result of submitKernelInternal()
bool KernelBlitManager::shaderCopyBuffer(address dst, address src,
                                         const amd::Coord3D& dstOrigin,
                                         const amd::Coord3D& srcOrigin,
                                         const amd::Coord3D& sizeIn, bool entire,
                                         const uint32_t blitWg,
                                         amd::CopyMetadata copyMetadata,
                                         bool attachSignal) const {
  constexpr uint32_t kBlitType = BlitCopyBuffer;
  constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
  const auto& blitKernel = kernels_[kBlitType];

  // Both origins must sit on the wide boundary to use the wide element size
  const bool srcOnBoundary = (srcOrigin[0] % kMaxAlignment) == 0;
  const bool dstOnBoundary = (dstOrigin[0] % kMaxAlignment) == 0;
  const bool aligned = srcOnBoundary && dstOnBoundary;
  uint32_t aligned_size = aligned ? kMaxAlignment : static_cast<uint32_t>(sizeof(uint32_t));

  // Number of elements to copy, rounding up when the byte count is not a multiple
  uint32_t remainder = sizeIn[0] % aligned_size;
  size_t elementCount = sizeIn[0] / aligned_size;
  if (remainder != 0) {
    elementCount += 1;
  }

  // Dispatch dimensions: capped by the work-group limit, then padded to whole groups
  const size_t localWorkSize = aligned ? 512 : 1024;
  size_t globalWorkSize =
      amd::alignUp(std::min(blitWg * localWorkSize, elementCount), localWorkSize);

  // Kernel arguments for the blit operation
  // Arg 0: source address together with the source origin
  // NOTE(review): the meaning of the trailing `true` flag is defined by setArgument — confirm
  setArgument(blitKernel, 0, sizeof(src), reinterpret_cast<void*>(src),
              srcOrigin[0], nullptr, true);
  // Arg 1: destination address together with the destination origin
  setArgument(blitKernel, 1, sizeof(dst), reinterpret_cast<void*>(dst),
              dstOrigin[0], nullptr, true);

  // Arg 2..4: total byte count, tail byte count and element size
  uint64_t copySize = sizeIn[0];
  setArgument(blitKernel, 2, sizeof(copySize), &copySize);
  setArgument(blitKernel, 3, sizeof(remainder), &remainder);
  setArgument(blitKernel, 4, sizeof(aligned_size), &aligned_size);

  // Arg 5: destination address + destination offset + byte count, minus the tail
  uint64_t end_ptr = reinterpret_cast<uint64_t>(dst) + dstOrigin[0] + sizeIn[0] - remainder;
  setArgument(blitKernel, 5, sizeof(end_ptr), &end_ptr);

  // Arg 6: the global work size, passed as a 32-bit value
  uint32_t next_chunk = globalWorkSize;
  setArgument(blitKernel, 6, sizeof(next_chunk), &next_chunk);

  // One-dimensional ND range for the kernel's execution
  amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);

  // Capture the arguments, submit the blit, then release the captured arguments
  address parameters = captureArguments(blitKernel);
  const bool submitted = gpu().submitKernelInternal(ndrange, *blitKernel, parameters, nullptr,
                                                    0, nullptr, nullptr, attachSignal);
  releaseArguments(parameters);
  return submitted;
}
// ================================================================================================
bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
@@ -2238,32 +2215,28 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
amd::ScopedLock k(lockXferOps_);
bool result = false;
bool p2p = false;
uint32_t blit_wg_ = dev().settings().limit_blit_wg_;
uint32_t blitWg = dev().settings().limit_blit_wg_;
if (&gpuMem(srcMemory).dev() != &gpuMem(dstMemory).dev()) {
if (sizeIn[0] > dev().settings().sdma_p2p_threshold_) {
p2p = true;
} else {
constexpr uint32_t kLimitWgForKernelP2p = 16;
blit_wg_ = kLimitWgForKernelP2p;
blitWg = kLimitWgForKernelP2p;
}
}
bool asan = false;
bool ipcShared = srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
#if defined(__clang__)
#if __has_feature(address_sanitizer)
asan = true;
#endif
#endif
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
(!(p2p || asan || ipcShared) &&
(!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess() &&
!(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::SDMA)) ||
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT));
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
(sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
(!(p2p || ipcShared) &&
(!srcMemory.isHostMemDirectAccess()
&& !dstMemory.isHostMemDirectAccess() &&
!(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::SDMA)) ||
(copyMetadata.copyEnginePreference_ ==
amd::CopyMetadata::CopyEnginePreference::BLIT));
if (!useShaderCopyPath) {
if (amd::IS_HIP) {
@@ -2275,60 +2248,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER);
}
}
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire,
copyMetadata);
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn,
entire, copyMetadata);
}
if (!result) {
constexpr uint32_t kBlitType = BlitCopyBuffer;
constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
amd::Coord3D size(sizeIn[0]);
// Check alignments for source and destination
bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);
// Setup copy size accordingly to the alignment
uint32_t remainder = size[0] % aligned_size;
size.c[0] /= aligned_size;
size.c[0] += (remainder != 0) ? 1 : 0;
// Program the dispatch dimensions
const size_t localWorkSize = (aligned) ? 512 : 1024;
size_t globalWorkSize = std::min(blit_wg_ * localWorkSize, size[0]);
globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
// Program kernels arguments for the blit operation
cl_mem mem = as_cl<amd::Memory>(srcMemory.owner());
// Program source origin
uint64_t srcOffset = srcOrigin[0];
setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem, srcOffset, &srcMemory);
mem = as_cl<amd::Memory>(dstMemory.owner());
// Program destinaiton origin
uint64_t dstOffset = dstOrigin[0];
setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem, dstOffset, &dstMemory);
uint64_t copySize = sizeIn[0];
setArgument(kernels_[kBlitType], 2, sizeof(copySize), &copySize);
setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder);
setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size);
// End pointer is the aligned copy size and destination offset
uint64_t end_ptr = dstMemory.virtualAddress() + dstOffset + sizeIn[0] - remainder;
setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr);
uint32_t next_chunk = globalWorkSize;
setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk);
// Create ND range object for the kernel's execution
amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);
// Execute the blit
address parameters = captureArguments(kernels_[kBlitType]);
result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters, nullptr);
releaseArguments(parameters);
result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
reinterpret_cast<address>(srcMemory.virtualAddress()),
dstOrigin, srcOrigin, sizeIn,
entire, blitWg, copyMetadata);
}
synchronize();
+16 -26
Просмотреть файл
@@ -231,7 +231,11 @@ class DmaBlitManager : public device::HostBlitManager {
//! taking into account the Hsail profile supported by Hsa Agent
bool hsaCopy(const Memory& srcMemory, const Memory& dstMemory, const amd::Coord3D& srcOrigin,
const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
amd::CopyMetadata copyMetadata) const;
amd::CopyMetadata& copyMetadata) const;
inline bool rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
const_address src, hsa_agent_t& srcAgent, size_t size,
amd::CopyMetadata& copyMetadata) const;
const size_t MinSizeForPinnedTransfer;
bool completeOperation_; //!< DMA blit manager must complete operation
@@ -248,33 +252,13 @@ class DmaBlitManager : public device::HostBlitManager {
//! Disable operator=
DmaBlitManager& operator=(const DmaBlitManager&);
//! Reads video memory, using a staged buffer
bool readMemoryStaged(Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
Memory& xferBuf, //!< Staged buffer for read
size_t origin, //!< Original offset in the source memory
size_t& offset, //!< Offset for the current copy pointer
size_t& totalSize, //!< Total size for copy region
size_t xferSize //!< Transfer size
) const;
//! Write into video memory, using a staged buffer
bool writeMemoryStaged(const void* srcHost, //!< Source host memory
Memory& dstMemory, //!< Destination memory object
address staging, //!< Staged buffer for write
size_t origin, //!< Original offset in the destination memory
size_t& offset, //!< Offset for the current copy pointer
size_t& totalSize, //!< Total size for the copy region
size_t xferSize //!< Transfer size
) const;
//! Assists in transferring data from Host to Local or vice versa
//! taking into account the Hsail profile supported by Hsa Agent
bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied
address hostDst, //!< Destination buffer address for copying
size_t size, //!< Size of data to copy in bytes
address staging, //!< Staging resource
bool hostToDev //!< True if data is copied from Host To Device
bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied
address hostDst, //!< Destination buffer address for copying
size_t size, //!< Size of data to copy in bytes
bool hostToDev, //!< True if data is copied from H2D
amd::CopyMetadata& copyMetadata //!< Memory copy MetaData
) const;
bool forceHostWaitFunc(size_t copy_size) const;
@@ -583,6 +567,12 @@ class KernelBlitManager : public DmaBlitManager {
return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal;
}
//! Copies a buffer using the shader path
bool shaderCopyBuffer(address dst, address src,
const amd::Coord3D& dstOrigin, const amd::Coord3D& srcOrigin,
const amd::Coord3D& size, bool entire, const uint32_t blitWg,
amd::CopyMetadata copyMetadata, bool attachSignal = false) const;
//! Disable copy constructor
KernelBlitManager(const KernelBlitManager&);
+2 -1
Просмотреть файл
@@ -1202,7 +1202,8 @@ bool Device::populateOCLDeviceConstants() {
&info_.globalMemCacheLineSize_)) {
return false;
}
assert(info_.globalMemCacheLineSize_ > 0);
info_.globalMemCacheLineSize_ = (info_.globalMemCacheLineSize_ != 0) ?
info_.globalMemCacheLineSize_ : 64;
uint32_t cachesize[4] = {0};
if (HSA_STATUS_SUCCESS !=
+1 -2
Просмотреть файл
@@ -57,8 +57,7 @@ bool LightningKernel::postLoad() {
}
kernargSegmentAlignment_ =
amd::alignUp(std::max(kernargSegmentAlignment_, 128u),
device().info().globalMemCacheLineSize_ > 0 ?
device().info().globalMemCacheLineSize_ : 64);
device().info().globalMemCacheLineSize_);
// Set the workgroup information for the kernel
workGroupInfo_.availableLDSSize_ = device().info().localMemSizePerCU_;
+32 -24
Просмотреть файл
@@ -497,7 +497,8 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
}
// ================================================================================================
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine,
bool forceHostWait) {
bool explicit_wait = false;
// Reset all current waiting signals
waiting_signals_.clear();
@@ -545,8 +546,8 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
const Settings& settings = gpu_.dev().settings();
// Actively wait on CPU to avoid extra overheads of signal tracking on GPU.
// For small copies set forced wait
if (!WaitForSignal<true>(external_signals_[i]->signal_, false,
external_signals_[i]->flags_.forceHostWait_)) {
if (!WaitForSignal<true>(external_signals_[i]->signal_, false, forceHostWait ?
external_signals_[i]->flags_.forceHostWait_ : false)) {
if (settings.cpu_wait_for_signal_) {
// Wait on CPU for completion if requested
CpuWaitForSignal(external_signals_[i]);
@@ -713,6 +714,9 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
else {
uint32_t index = desc.info_.arrayIndex_;
mem = memories[index];
const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
ClPrint(amd::LOG_INFO, amd::LOG_KERN,
"Arg%d: %s %s = ptr:%p", i, desc.typeName_.c_str(), desc.name_.c_str(), globalAddress);
if (mem == nullptr) {
//! This condition is for SVM fine-grain
if (dev().isFineGrainedSystem(true)) {
@@ -839,7 +843,7 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1
// ================================================================================================
template <typename AqlPacket>
bool VirtualGPU::dispatchGenericAqlPacket(
AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking) {
AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, bool attach_signal) {
const uint32_t queueSize = gpu_queue_->size;
const uint32_t queueMask = queueSize - 1;
const uint32_t sw_queue_size = queueMask;
@@ -847,6 +851,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
// Check for queue full and wait if needed.
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
if (addSystemScope_) {
header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
@@ -858,15 +863,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
if (fence_state_ == amd::Device::kCacheStateSystem &&
expected_fence_state == amd::Device::kCacheStateSystem) {
if (fence_state_ == amd::Device::kCacheStateSystem
&& expected_fence_state == amd::Device::kCacheStateSystem) {
header = dispatchPacketHeader_;
fence_dirty_ = true;
}
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
if (timestamp_ != nullptr) {
if (timestamp_ != nullptr || attach_signal) {
// Get active signal for current dispatch if profiling is necessary
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
@@ -967,7 +972,7 @@ void VirtualGPU::dispatchBlockingWait() {
// ================================================================================================
bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
uint16_t rest, bool blocking, bool capturing,
const uint8_t* aqlPacket) {
const uint8_t* aqlPacket, bool attach_signal) {
if (capturing == true) {
packet->header = header;
packet->setup = rest;
@@ -975,13 +980,13 @@ bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_
return true;
} else {
dispatchBlockingWait();
return dispatchGenericAqlPacket(packet, header, rest, blocking);
return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
}
}
// ================================================================================================
bool VirtualGPU::dispatchAqlPacket(
hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
return dispatchGenericAqlPacket(packet, header, rest, blocking);
bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
bool blocking, bool attach_signal) {
return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
}
// ================================================================================================
@@ -1074,10 +1079,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
barrier_packet_.completion_signal = signal;
}
// Reset fence_dirty_ and addSystemScope_ flag if we submit a barrier with system scopes
// Reset fence_dirty_ flag if we submit a barrier with system scopes
if (cache_state == amd::Device::kCacheStateSystem) {
fence_dirty_ = false;
addSystemScope_ = false;
}
while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1424,7 +1428,7 @@ bool VirtualGPU::ManagedBuffer::Create() {
// ================================================================================================
address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
auto alignment = gpu_.dev().info().globalMemCacheLineSize_;
auto alignment = amd::alignUp(256u, gpu_.dev().info().globalMemCacheLineSize_);
address result = nullptr;
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
const size_t pool_new_usage = (result + size) - pool_base_;
@@ -1713,7 +1717,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
bool imageBuffer = false;
// Force buffer read for IMAGE1D_BUFFER
if ((type == CL_COMMAND_READ_IMAGE) && (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
if ((type == CL_COMMAND_READ_IMAGE) &&
(cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
type = CL_COMMAND_READ_BUFFER;
imageBuffer = true;
}
@@ -1732,7 +1737,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
result = blitMgr().copyBuffer(*devMem, *hostMemory, origin, dstOrigin, size,
cmd.isEntireMemory(), cmd.copyMetadata());
} else {
result = blitMgr().readBuffer(*devMem, dst, origin, size, cmd.isEntireMemory(), cmd.copyMetadata());
result = blitMgr().readBuffer(*devMem, dst, origin, size,
cmd.isEntireMemory(), cmd.copyMetadata());
}
break;
}
@@ -1752,7 +1758,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
break;
}
case CL_COMMAND_READ_IMAGE: {
if ((cmd.source().parent() != nullptr) && (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
if ((cmd.source().parent() != nullptr) &&
(cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
Image* imageBuffer = static_cast<Image*>(devMem);
// Check if synchronization has to be performed
if (nullptr != imageBuffer->CopyImageBuffer()) {
@@ -1772,7 +1779,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
amd::Coord3D dstOrigin(offset);
result =
blitMgr().copyImageToBuffer(*devMem, *hostMemory, cmd.origin(), dstOrigin, size,
cmd.isEntireMemory(), cmd.rowPitch(), cmd.slicePitch(), cmd.copyMetadata());
cmd.isEntireMemory(), cmd.rowPitch(),
cmd.slicePitch(), cmd.copyMetadata());
} else {
result = blitMgr().readImage(*devMem, dst, cmd.origin(), size, cmd.rowPitch(),
cmd.slicePitch(), cmd.isEntireMemory(), cmd.copyMetadata());
@@ -1839,7 +1847,8 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
result = blitMgr().copyBuffer(*hostMemory, *devMem, srcOrigin, origin, size,
cmd.isEntireMemory(), cmd.copyMetadata());
} else {
result = blitMgr().writeBuffer(src, *devMem, origin, size, cmd.isEntireMemory(), cmd.copyMetadata());
result = blitMgr().writeBuffer(src, *devMem, origin, size,
cmd.isEntireMemory(), cmd.copyMetadata());
}
break;
}
@@ -3131,9 +3140,9 @@ void VirtualGPU::HiddenHeapInit() { const_cast<Device&>(dev()).HiddenHeapInit(*t
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
const amd::Kernel& kernel, const_address parameters, void* event_handle,
uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd,
hsa_kernel_dispatch_packet_t* aql_packet) {
hsa_kernel_dispatch_packet_t* aql_packet, bool attach_signal) {
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -3482,7 +3491,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
addSystemScope_ = true;
}
// Copy scheduler's AQL packet for possible relaunch from the scheduler itself
if (aql_packet != nullptr) {
*aql_packet = dispatchPacket;
@@ -3504,7 +3512,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
} else {
if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
(sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
GPU_FLUSH_ON_EXECUTION)) {
GPU_FLUSH_ON_EXECUTION, false, nullptr, attach_signal)) {
return false;
}
}
+13 -7
Просмотреть файл
@@ -30,6 +30,7 @@
#include "rocprintf.hpp"
#include "hsa/hsa_ven_amd_aqlprofile.h"
#include "rocsched.hpp"
#include "device/device.hpp"
namespace amd::roc {
class Device;
@@ -270,7 +271,8 @@ class VirtualGPU : public device::VirtualDevice {
HwQueueEngine GetActiveEngine() const { return engine_; }
//! Returns the last submitted signal for a wait
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute,
bool forceHostWait = true);
//! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
void ResetCurrentSignal();
@@ -341,8 +343,8 @@ class VirtualGPU : public device::VirtualDevice {
void* event_handle, //!< Handle to OCL event for debugging
uint32_t sharedMemBytes = 0, //!< Shared memory size
amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command
hsa_kernel_dispatch_packet_t* aql_packet = nullptr //!< Scheduler launch
);
hsa_kernel_dispatch_packet_t* aql_packet = nullptr, //!< Scheduler launch
bool attach_signal = false);
void submitNativeFn(amd::NativeFnCommand& cmd);
void submitMarker(amd::Marker& cmd);
void submitAccumulate(amd::AccumulateCommand& cmd);
@@ -420,7 +422,10 @@ class VirtualGPU : public device::VirtualDevice {
void hasPendingDispatch() { hasPendingDispatch_ = true; }
bool IsPendingDispatch() const { return (hasPendingDispatch_) ? true : false; }
void addSystemScope() { addSystemScope_ = true; }
//! Sets the addSystemScope_ flag and resets the tracked fence state
//! (fence_state_) to kCacheStateInvalid.
//! NOTE(review): dispatchGenericAqlPacket() checks addSystemScope_ when it
//! adjusts the packet header fence scopes; where the flag gets cleared again
//! is not visible from this declaration - confirm against the implementation.
void addSystemScope() {
addSystemScope_ = true;
// Drop the cached fence state together with raising the flag
fence_state_ = amd::Device::CacheState::kCacheStateInvalid;
}
void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }
HwQueueTracker& Barriers() { return barriers_; }
@@ -444,11 +449,12 @@ class VirtualGPU : public device::VirtualDevice {
amd::AccumulateCommand* vcmd = nullptr);
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
bool blocking = true, bool capturing = false,
const uint8_t* aqlPacket = nullptr);
const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header,
uint16_t rest, bool blocking = true);
uint16_t rest, bool blocking = true, bool attach_signal = false);
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
uint16_t rest, bool blocking);
uint16_t rest, bool blocking,
bool attach_signal = false);
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
+3 -3
Просмотреть файл
@@ -85,7 +85,7 @@ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \
release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \
"The resource cache size in MB") \
release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
"The maximum size accepted for suballocaitons in KB") \
"The maximum size accepted for suballocations in KB") \
release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \
"Number of memory objects for dependency tracking") \
release(size_t, GPU_XFER_BUFFER_SIZE, 0, \
@@ -105,7 +105,7 @@ release(bool, GPU_USE_DEVICE_QUEUE, false, \
release(bool, AMD_THREAD_TRACE_ENABLE, true, \
"Enable thread trace extension") \
release(uint, OPENCL_VERSION, 200, \
"Force GPU opencl verison") \
"Force GPU opencl version") \
release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \
"Enable HSA device local memory usage") \
release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \
@@ -186,7 +186,7 @@ release(bool, AMD_DIRECT_DISPATCH, false, \
release(uint, HIP_HIDDEN_FREE_MEM, 0, \
"Reserve free mem reporting in Mb" \
"0 = Disable") \
release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 0, \
release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 16, \
"Use Blit until this size(in KB) for copies") \
release(uint, ROC_ACTIVE_WAIT_TIMEOUT, 0, \
"Forces active wait of GPU interrup for the timeout(us)") \