SWDEV-497145 - Use rocr copyOnEngine API for staged copies
- Refactor blit code and clean ASAN instrumentation
- Use unified function for rocr copy
- Enable shader copy path for unpinned writeBuffer/readBuffer paths
- Set GPU_FORCE_BLIT_COPY_SIZE=16, which means we will use BLIT copy for
pinned copies or unpinned H2D/D2H copies < 16 KB
Change-Id: I42045cca79234b340dbf53dafb93044199736ae4
[ROCm/clr commit: 7863eb92dc]
Этот коммит содержится в:
@@ -48,21 +48,6 @@ inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const {
|
||||
return static_cast<Memory&>(mem);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Reads a byte range of device memory into a host buffer by delegating to hsaCopyStaged().
//
//   srcMemory  - memory object whose device address is the copy source
//   dstHost    - host pointer receiving the data
//   xferBuf    - memory object whose device address is passed on as the staging buffer
//   origin     - byte offset added to the source device address
//   offset     - byte offset added to both the source address and dstHost
//   totalSize  - byte count forwarded to hsaCopyStaged()
//   xferSize   - not referenced by this function
//
// Returns whatever hsaCopyStaged() returns.
bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory& xferBuf,
                                      size_t origin, size_t& offset, size_t& totalSize,
                                      size_t xferSize) const {
  constexpr bool kHostToDev = false;  // this is a device-to-host transfer

  // Query both device addresses first, source before staging
  const_address deviceSrc = srcMemory.getDeviceMemory();
  address stagingAddr = xferBuf.getDeviceMemory();

  // Advance the source to the requested position inside the allocation
  deviceSrc += origin + offset;

  return hsaCopyStaged(deviceSrc, reinterpret_cast<address>(dstHost) + offset, totalSize,
                       stagingAddr, kHostToDev);
}
|
||||
|
||||
// ================================================================================================
|
||||
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
@@ -77,77 +62,15 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
gpu().Barriers().WaitCurrent();
|
||||
return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
|
||||
} else {
|
||||
size_t srcSize = size[0];
|
||||
size_t offset = 0;
|
||||
size_t pinSize = dev().settings().pinnedXferSize_;
|
||||
pinSize = std::min(pinSize, srcSize);
|
||||
|
||||
// Check if a pinned transfer can be executed
|
||||
if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
|
||||
// Align offset to 4K boundary
|
||||
char* tmpHost = const_cast<char*>(
|
||||
amd::alignDown(reinterpret_cast<const char*>(dstHost), PinnedMemoryAlignment));
|
||||
|
||||
// Find the partial size for unaligned copy
|
||||
size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;
|
||||
|
||||
amd::Memory* pinned = nullptr;
|
||||
bool first = true;
|
||||
size_t tmpSize;
|
||||
size_t pinAllocSize;
|
||||
|
||||
// Copy memory, using pinning
|
||||
while (srcSize > 0) {
|
||||
// If it's the first iteration, then readjust the copy size
|
||||
// to include alignment
|
||||
if (first) {
|
||||
pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
|
||||
tmpSize = std::min(pinAllocSize - partial, srcSize);
|
||||
first = false;
|
||||
} else {
|
||||
tmpSize = std::min(pinSize, srcSize);
|
||||
pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
|
||||
partial = 0;
|
||||
}
|
||||
amd::Coord3D dst(partial, 0, 0);
|
||||
amd::Coord3D srcPin(origin[0] + offset, 0, 0);
|
||||
amd::Coord3D copySizePin(tmpSize, 0, 0);
|
||||
size_t partial2;
|
||||
|
||||
// Allocate a GPU resource for pinning
|
||||
pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
|
||||
if (pinned != nullptr) {
|
||||
// Get device memory for this virtual device
|
||||
Memory* dstMemory = dev().getRocMemory(pinned);
|
||||
const KernelBlitManager *kb = dynamic_cast<const KernelBlitManager*>(this);
|
||||
if (!kb->copyBuffer(gpuMem(srcMemory), *dstMemory, srcPin, dst,
|
||||
copySizePin)) {
|
||||
LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
|
||||
gpu().addPinnedMem(pinned);
|
||||
break;
|
||||
}
|
||||
gpu().addPinnedMem(pinned);
|
||||
} else {
|
||||
LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
|
||||
break;
|
||||
}
|
||||
srcSize -= tmpSize;
|
||||
offset += tmpSize;
|
||||
tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 != srcSize) {
|
||||
Memory& xferBuf = dev().xferRead().acquire();
|
||||
|
||||
// Read memory using a staging resource
|
||||
if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], offset, srcSize,
|
||||
srcSize)) {
|
||||
LogError("DmaBlitManager::readBuffer failed!");
|
||||
size_t copySize = size[0];
|
||||
if (0 != copySize) {
|
||||
const_address addrSrc = gpuMem(srcMemory).getDeviceMemory() + origin[0];
|
||||
address addrDst = reinterpret_cast<address>(dstHost);
|
||||
constexpr bool kHostToDev = false;
|
||||
if(!hsaCopyStaged(addrSrc, addrDst, copySize, kHostToDev, copyMetadata)) {
|
||||
LogError("DmaBlitManager::readBuffer staged copy failed!");
|
||||
return false;
|
||||
}
|
||||
|
||||
dev().xferRead().release(gpu(), xferBuf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -170,8 +93,6 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
gpu().Barriers().WaitCurrent();
|
||||
return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire, copyMetadata);
|
||||
} else {
|
||||
Memory& xferBuf = dev().xferRead().acquire();
|
||||
address staging = xferBuf.getDeviceMemory();
|
||||
const_address src = gpuMem(srcMemory).getDeviceMemory();
|
||||
|
||||
size_t srcOffset;
|
||||
@@ -184,13 +105,12 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
|
||||
// Copy data from device to host - line by line
|
||||
address dst = reinterpret_cast<address>(dstHost) + dstOffset;
|
||||
bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], staging, false);
|
||||
bool retval = hsaCopyStaged(src + srcOffset, dst, size[0], false, copyMetadata);
|
||||
if (!retval) {
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
}
|
||||
dev().xferRead().release(gpu(), xferBuf);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -216,20 +136,6 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, address staging,
|
||||
size_t origin, size_t& offset, size_t& totalSize,
|
||||
size_t xferSize) const {
|
||||
address dst = dstMemory.getDeviceMemory();
|
||||
|
||||
// Copy data from host to device
|
||||
dst += origin + offset;
|
||||
const_address src = reinterpret_cast<const_address>(srcHost) + offset;
|
||||
bool retval = hsaCopyStaged(src, dst, totalSize, staging, true);
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
@@ -241,79 +147,17 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
|
||||
} else {
|
||||
// HSA copy functionality with a possible async operation
|
||||
gpu().releaseGpuMemoryFence(kSkipCpuWait);
|
||||
|
||||
size_t dstSize = size[0];
|
||||
size_t tmpSize = 0;
|
||||
size_t offset = 0;
|
||||
size_t pinSize = dev().settings().pinnedXferSize_;
|
||||
pinSize = std::min(pinSize, dstSize);
|
||||
|
||||
// Check if a pinned transfer can be executed
|
||||
if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
|
||||
// Align offset to 4K boundary
|
||||
char* tmpHost = const_cast<char*>(
|
||||
amd::alignDown(reinterpret_cast<const char*>(srcHost), PinnedMemoryAlignment));
|
||||
|
||||
// Find the partial size for unaligned copy
|
||||
size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
|
||||
|
||||
amd::Memory* pinned = nullptr;
|
||||
bool first = true;
|
||||
size_t tmpSize;
|
||||
size_t pinAllocSize;
|
||||
|
||||
// Copy memory, using pinning
|
||||
while (dstSize > 0) {
|
||||
// If it's the first iteration, then readjust the copy size
|
||||
// to include alignment
|
||||
if (first) {
|
||||
pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
|
||||
tmpSize = std::min(pinAllocSize - partial, dstSize);
|
||||
first = false;
|
||||
} else {
|
||||
tmpSize = std::min(pinSize, dstSize);
|
||||
pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
|
||||
partial = 0;
|
||||
}
|
||||
amd::Coord3D src(partial, 0, 0);
|
||||
amd::Coord3D dstPin(origin[0] + offset, 0, 0);
|
||||
amd::Coord3D copySizePin(tmpSize, 0, 0);
|
||||
size_t partial2;
|
||||
|
||||
// Allocate a GPU resource for pinning
|
||||
pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
|
||||
|
||||
if (pinned != nullptr) {
|
||||
// Get device memory for this virtual device
|
||||
Memory* srcMemory = dev().getRocMemory(pinned);
|
||||
const KernelBlitManager *kb = dynamic_cast<const KernelBlitManager*>(this);
|
||||
if (!kb->copyBuffer(*srcMemory, gpuMem(dstMemory), src, dstPin,
|
||||
copySizePin)) {
|
||||
LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
|
||||
gpu().addPinnedMem(pinned);
|
||||
break;
|
||||
}
|
||||
gpu().addPinnedMem(pinned);
|
||||
} else {
|
||||
LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
|
||||
break;
|
||||
}
|
||||
dstSize -= tmpSize;
|
||||
offset += tmpSize;
|
||||
tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
|
||||
}
|
||||
}
|
||||
|
||||
if (dstSize != 0) {
|
||||
address staging = gpu().Staging().Acquire(
|
||||
std::min(dstSize, dev().settings().stagedXferSize_));
|
||||
size_t copySize = size[0];
|
||||
|
||||
// For small copies use managed staging buffers which can be non blocking
|
||||
if (copySize != 0) {
|
||||
address dstAddr = gpuMem(dstMemory).getDeviceMemory() + origin[0];
|
||||
const_address srcAddr = reinterpret_cast<const_address>(srcHost);
|
||||
// Write memory using a staging resource
|
||||
if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), staging, origin[0], offset, dstSize,
|
||||
dstSize)) {
|
||||
LogError("DmaBlitManager::writeBuffer failed!");
|
||||
constexpr bool kHostToDev = true;
|
||||
bool result = hsaCopyStaged(srcAddr, dstAddr, copySize, kHostToDev, copyMetadata);
|
||||
if (!result) {
|
||||
LogError("DmaBlitManager::writeBuffer staging copy failed!");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -336,8 +180,6 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire,
|
||||
copyMetadata);
|
||||
} else {
|
||||
address staging = gpu().Staging().Acquire(
|
||||
std::min(size[0], dev().settings().stagedXferSize_));
|
||||
address dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
|
||||
|
||||
size_t srcOffset;
|
||||
@@ -350,7 +192,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
|
||||
// Copy data from host to device - line by line
|
||||
const_address src = reinterpret_cast<const_address>(srcHost) + srcOffset;
|
||||
bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], staging, true);
|
||||
constexpr bool kHostToDev = true;
|
||||
bool retval = hsaCopyStaged(src, dst + dstOffset, size[0], kHostToDev, copyMetadata);
|
||||
if (!retval) {
|
||||
return retval;
|
||||
}
|
||||
@@ -632,43 +475,10 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, amd::CopyMetadata copyMetadata) const {
|
||||
address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
|
||||
address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
|
||||
|
||||
gpu().releaseGpuMemoryFence(kSkipCpuWait);
|
||||
|
||||
src += srcOrigin[0];
|
||||
dst += dstOrigin[0];
|
||||
|
||||
// Just call copy function for full profile
|
||||
inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
|
||||
const_address src, hsa_agent_t& srcAgent, size_t size,
|
||||
amd::CopyMetadata& copyMetadata) const {
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
if (dev().agent_profile() == HSA_PROFILE_FULL) {
|
||||
// Stall GPU, since CPU copy is possible
|
||||
gpu().Barriers().WaitCurrent();
|
||||
status = hsa_memory_copy(dst, src, size[0]);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("Hsa copy of data failed with code %d", status);
|
||||
}
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
hsa_agent_t srcAgent;
|
||||
hsa_agent_t dstAgent;
|
||||
|
||||
if (&srcMemory.dev() == &dstMemory.dev()) {
|
||||
// Detect the agents for memory allocations
|
||||
srcAgent =
|
||||
(srcMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
|
||||
dstAgent =
|
||||
(dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
|
||||
}
|
||||
else {
|
||||
srcAgent = srcMemory.dev().getBackendDevice();
|
||||
dstAgent = dstMemory.dev().getBackendDevice();
|
||||
}
|
||||
|
||||
uint32_t copyMask = 0;
|
||||
uint32_t freeEngineMask = 0;
|
||||
@@ -707,9 +517,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
}
|
||||
|
||||
// Check if host wait has to be forced
|
||||
bool forceHostWait = forceHostWaitFunc(size[0]);
|
||||
bool forceHostWait = forceHostWaitFunc(size);
|
||||
|
||||
auto wait_events = gpu().Barriers().WaitingSignal(engine);
|
||||
constexpr bool kIgnoreHostWait = false;
|
||||
// Ignore waiting on any previous kernel dispatch and queue a signal to ROCr copy api instead
|
||||
auto wait_events = gpu().Barriers().WaitingSignal(engine, kIgnoreHostWait);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp(),
|
||||
forceHostWait);
|
||||
|
||||
@@ -740,11 +552,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
"HSA Async Copy on copy_engine=0x%x, dst=0x%zx, src=0x%zx, "
|
||||
"size=%ld, forceSDMA=%d, wait_event=0x%zx, completion_signal=0x%zx", copyEngine,
|
||||
dst, src, size[0], forceSDMA, (wait_events.size() != 0) ? wait_events[0].handle : 0,
|
||||
dst, src, size, forceSDMA, (wait_events.size() != 0) ? wait_events[0].handle : 0,
|
||||
active.handle);
|
||||
|
||||
status = hsa_amd_memory_async_copy_on_engine(dst, dstAgent, src, srcAgent,
|
||||
size[0], wait_events.size(),
|
||||
size, wait_events.size(),
|
||||
wait_events.data(), active, copyEngine,
|
||||
forceSDMA);
|
||||
} else {
|
||||
@@ -756,11 +568,11 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
"HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, "
|
||||
"completion_signal=0x%zx",
|
||||
dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0,
|
||||
dst, src, size, (wait_events.size() != 0) ? wait_events[0].handle : 0,
|
||||
active.handle);
|
||||
|
||||
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
|
||||
size[0], wait_events.size(), wait_events.data(), active);
|
||||
size, wait_events.size(), wait_events.data(), active);
|
||||
}
|
||||
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
@@ -773,93 +585,104 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
// ================================================================================================
|
||||
bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, amd::CopyMetadata& copyMetadata) const {
|
||||
address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
|
||||
address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
|
||||
|
||||
gpu().releaseGpuMemoryFence(kSkipCpuWait);
|
||||
|
||||
src += srcOrigin[0];
|
||||
dst += dstOrigin[0];
|
||||
|
||||
hsa_agent_t srcAgent;
|
||||
hsa_agent_t dstAgent;
|
||||
|
||||
if (&srcMemory.dev() == &dstMemory.dev()) {
|
||||
// Detect the agents for memory allocations
|
||||
srcAgent =
|
||||
(srcMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
|
||||
dstAgent =
|
||||
(dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
|
||||
}
|
||||
else {
|
||||
srcAgent = srcMemory.dev().getBackendDevice();
|
||||
dstAgent = dstMemory.dev().getBackendDevice();
|
||||
}
|
||||
|
||||
return rocrCopyBuffer(dst, dstAgent, src, srcAgent, size[0], copyMetadata);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
|
||||
address staging, bool hostToDev) const {
|
||||
bool hostToDev, amd::CopyMetadata& copyMetadata) const {
|
||||
// Stall GPU, since CPU copy is possible
|
||||
gpu().releaseGpuMemoryFence(hostToDev);
|
||||
|
||||
// No allocation is necessary for Full Profile
|
||||
hsa_status_t status;
|
||||
if (dev().agent_profile() == HSA_PROFILE_FULL) {
|
||||
status = hsa_memory_copy(hostDst, hostSrc, size);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("Hsa copy of data failed with code %d", status);
|
||||
}
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
size_t totalSize = size;
|
||||
size_t offset = 0;
|
||||
size_t stagedCopyOffset = 0;
|
||||
bool status = true;
|
||||
Memory* xferBuf = nullptr;
|
||||
address stagingBuffer = 0;
|
||||
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
|
||||
|
||||
address hsaBuffer = staging;
|
||||
if (!hostToDev) {
|
||||
// Get static staging buffer as we need to wait until copy on GPU completes to copy
|
||||
// it back to the unpinned buffer
|
||||
xferBuf = &dev().xferRead().acquire();
|
||||
stagingBuffer = xferBuf->getDeviceMemory();
|
||||
}
|
||||
|
||||
// Allocate requested size of memory
|
||||
while (totalSize > 0) {
|
||||
size = std::min(totalSize, dev().settings().stagedXferSize_);
|
||||
size = std::min(totalSize, maxStagedXferSize);
|
||||
|
||||
hsa_agent_t srcAgent;
|
||||
hsa_agent_t dstAgent;
|
||||
|
||||
// Copy data from Host to Device
|
||||
if (hostToDev) {
|
||||
const hsa_agent_t srcAgent = dev().getCpuAgent();
|
||||
hsa_agent_t srcAgent = dev().getCpuAgent();
|
||||
hsa_agent_t dstAgent = dev().getBackendDevice();
|
||||
|
||||
HwQueueEngine engine = HwQueueEngine::Unknown;
|
||||
if (srcAgent.handle == dev().getBackendDevice().handle) {
|
||||
engine = HwQueueEngine::SdmaWrite;
|
||||
// Get an address from managed staging buffer
|
||||
stagingBuffer = gpu().Staging().Acquire(std::min(size, maxStagedXferSize));
|
||||
|
||||
address dst = hostDst + stagedCopyOffset;
|
||||
memcpy(stagingBuffer, hostSrc + stagedCopyOffset, size);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged H2D");
|
||||
status = rocrCopyBuffer(dst, dstAgent, stagingBuffer, srcAgent, size, copyMetadata);
|
||||
if (!status) {
|
||||
break;
|
||||
}
|
||||
gpu().Barriers().SetActiveEngine(engine);
|
||||
auto wait_events = gpu().Barriers().WaitingSignal(engine);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
memcpy(hsaBuffer, hostSrc + offset, size);
|
||||
status = hsa_amd_memory_async_copy(
|
||||
hostDst + offset, dev().getBackendDevice(), hsaBuffer, srcAgent, size,
|
||||
wait_events.size(), wait_events.data(), active);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
"HSA Async Copy staged H2D dst=0x%zx, src=0x%zx, size=%ld, completion_signal=0x%zx",
|
||||
hostDst + offset, hsaBuffer, size, active.handle);
|
||||
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("Hsa copy from host to device failed with code %d", status);
|
||||
return false;
|
||||
}
|
||||
totalSize -= size;
|
||||
if (totalSize > 0) {
|
||||
// Wait if there are extra copies, which don't fit in a single staging buffer
|
||||
gpu().Barriers().WaitCurrent();
|
||||
}
|
||||
offset += size;
|
||||
continue;
|
||||
}
|
||||
|
||||
const hsa_agent_t dstAgent = dev().getCpuAgent();
|
||||
|
||||
HwQueueEngine engine = HwQueueEngine::Unknown;
|
||||
if (dstAgent.handle == dev().getBackendDevice().handle) {
|
||||
engine = HwQueueEngine::SdmaRead;
|
||||
}
|
||||
gpu().Barriers().SetActiveEngine(engine);
|
||||
auto wait_events = gpu().Barriers().WaitingSignal(engine);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
// Copy data from Device to Host
|
||||
status = hsa_amd_memory_async_copy(
|
||||
hsaBuffer, dstAgent, hostSrc + offset, dev().getBackendDevice(), size,
|
||||
wait_events.size(), wait_events.data(), active);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
"HSA Async Copy staged D2H dst=0x%zx, src=0x%zx, size=%ld, completion_signal=0x%zx",
|
||||
hsaBuffer, hostSrc + offset, size, active.handle);
|
||||
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
gpu().Barriers().WaitCurrent();
|
||||
memcpy(hostDst + offset, hsaBuffer, size);
|
||||
} else {
|
||||
gpu().Barriers().ResetCurrentSignal();
|
||||
LogPrintfError("Hsa copy from device to host failed with code %d", status);
|
||||
return false;
|
||||
dstAgent = dev().getCpuAgent();
|
||||
srcAgent = dev().getBackendDevice();
|
||||
|
||||
const_address src = static_cast<const_address>(hostSrc) + stagedCopyOffset;
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Async Copy staged D2H");
|
||||
status = rocrCopyBuffer(stagingBuffer, dstAgent, src, srcAgent, size, copyMetadata);
|
||||
if (status) {
|
||||
gpu().Barriers().WaitCurrent();
|
||||
memcpy(hostDst + stagedCopyOffset, stagingBuffer, size);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
totalSize -= size;
|
||||
offset += size;
|
||||
stagedCopyOffset += size;
|
||||
}
|
||||
|
||||
if (!hostToDev) {
|
||||
dev().xferRead().release(gpu(), *xferBuf);
|
||||
}
|
||||
|
||||
if (!status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
gpu().addSystemScope();
|
||||
@@ -1829,13 +1652,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
synchronize();
|
||||
return result;
|
||||
} else {
|
||||
size_t pinSize = size[0];
|
||||
size_t totalSize = size[0];
|
||||
|
||||
// Check if a pinned transfer can be executed with a single pin
|
||||
|
||||
if (((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer))) {
|
||||
if (((totalSize <= dev().settings().pinnedXferSize_) &&
|
||||
(totalSize > MinSizeForPinnedTransfer))) {
|
||||
size_t partial;
|
||||
amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
|
||||
amd::Memory* amdMemory = pinHostMemory(dstHost, totalSize, partial);
|
||||
|
||||
if (amdMemory == nullptr) {
|
||||
// Force SW copy
|
||||
@@ -1857,7 +1680,55 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
// Add pinned memory for a later release
|
||||
gpu().addPinnedMem(amdMemory);
|
||||
} else {
|
||||
result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire, copyMetadata);
|
||||
// Do a staging copy
|
||||
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
|
||||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
|
||||
(copyMetadata.copyEnginePreference_ ==
|
||||
amd::CopyMetadata::CopyEnginePreference::BLIT);
|
||||
|
||||
if (!useShaderCopyPath) {
|
||||
// HSA copy using a staging resource
|
||||
result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size,
|
||||
entire, copyMetadata);
|
||||
}
|
||||
if (!result) {
|
||||
// Blit copy using a staging resource
|
||||
address srcAddr = gpuMem(srcMemory).getDeviceMemory();
|
||||
address dstAddr = reinterpret_cast<address>(dstHost);
|
||||
amd::Coord3D dstOrigin(0, 0, 0);
|
||||
size_t copySize = 0;
|
||||
size_t stagedCopyOffset = 0;
|
||||
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
|
||||
Memory& xferBuf = dev().xferRead().acquire();
|
||||
address xferBufAddr = xferBuf.getDeviceMemory();
|
||||
|
||||
constexpr bool kAttachSignal = true;
|
||||
while (totalSize > 0) {
|
||||
copySize = std::min(totalSize, maxStagedXferSize);
|
||||
srcAddr += stagedCopyOffset;
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging D2H copy stg buf=%p, src=%p, "
|
||||
"dstOrigin=%zu, size=%zu", xferBufAddr, srcAddr, dstOrigin[0], copySize);
|
||||
// Flush caches for coherency after the copy as we need to std::memcpy
|
||||
// from staging buffer to unpinned dst. Also attach a signal to the dispatch packet
|
||||
// itself that we can wait on without extra barrier packet.
|
||||
gpu().addSystemScope();
|
||||
result = shaderCopyBuffer(xferBufAddr, srcAddr, dstOrigin, origin, copySize,
|
||||
entire, dev().settings().limit_blit_wg_, copyMetadata,
|
||||
kAttachSignal);
|
||||
if (!result) {
|
||||
break;
|
||||
}
|
||||
// Wait on current signal of previous blit copy
|
||||
gpu().Barriers().WaitCurrent();
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
|
||||
(void*)(dstAddr + stagedCopyOffset), xferBufAddr, copySize);
|
||||
memcpy(dstAddr + stagedCopyOffset, xferBufAddr, copySize);
|
||||
totalSize -= copySize;
|
||||
stagedCopyOffset += copySize;
|
||||
}
|
||||
|
||||
dev().xferRead().release(gpu(), xferBuf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1934,16 +1805,20 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
synchronize();
|
||||
return result;
|
||||
} else {
|
||||
size_t pinSize = size[0];
|
||||
|
||||
size_t totalSize = size[0];
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Unpinned write path");
|
||||
// If size > min pinned size, do a pinning copy, since we are limited by staging buffer size
|
||||
// Check if a pinned transfer can be executed with a single pin
|
||||
if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) {
|
||||
if ((totalSize <= dev().settings().pinnedXferSize_) &&
|
||||
(totalSize > MinSizeForPinnedTransfer)) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Pinned write copy for size=%ld", totalSize);
|
||||
size_t partial;
|
||||
amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
|
||||
amd::Memory* amdMemory = pinHostMemory(srcHost, totalSize, partial);
|
||||
|
||||
if (amdMemory == nullptr) {
|
||||
// Force SW copy
|
||||
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
|
||||
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
|
||||
size, entire, copyMetadata);
|
||||
synchronize();
|
||||
return result;
|
||||
}
|
||||
@@ -1960,7 +1835,47 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
// Add pinned memory for a later release
|
||||
gpu().addPinnedMem(amdMemory);
|
||||
} else {
|
||||
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire, copyMetadata);
|
||||
// Do a staging copy
|
||||
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
|
||||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
|
||||
(copyMetadata.copyEnginePreference_ ==
|
||||
amd::CopyMetadata::CopyEnginePreference::BLIT);
|
||||
|
||||
if (!useShaderCopyPath) {
|
||||
// HSA copy using a staging resource
|
||||
result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin,
|
||||
size, entire, copyMetadata);
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
// Blit copy using a staging resource
|
||||
address dstAddr = gpuMem(dstMemory).getDeviceMemory();
|
||||
const_address srcAddr = reinterpret_cast<const_address>(srcHost);
|
||||
amd::Coord3D srcOrigin(0, 0, 0);
|
||||
size_t copySize = 0;
|
||||
size_t stagedCopyOffset = 0;
|
||||
size_t maxStagedXferSize = dev().settings().stagedXferSize_;
|
||||
|
||||
while (totalSize > 0) {
|
||||
copySize = std::min(totalSize, maxStagedXferSize);
|
||||
// Get an address from managed staging buffer
|
||||
address stagingBuffer = gpu().Staging().Acquire(std::min(copySize, maxStagedXferSize));
|
||||
dstAddr += stagedCopyOffset;
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
|
||||
stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
|
||||
memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Blit staging H2D copy dst=%p, stg buf=%p, "
|
||||
"dstOrigin=%zu, size=%zu", dstAddr, stagingBuffer, origin[0], copySize);
|
||||
result = shaderCopyBuffer(dstAddr, stagingBuffer,
|
||||
origin, srcOrigin, copySize,
|
||||
entire, dev().settings().limit_blit_wg_, copyMetadata);
|
||||
if (!result) {
|
||||
break;
|
||||
}
|
||||
totalSize -= copySize;
|
||||
stagedCopyOffset += copySize;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2230,6 +2145,68 @@ bool KernelBlitManager::fillBuffer3D(device::Memory& memory, const void* pattern
|
||||
ShouldNotReachHere();
|
||||
return false;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Programs and submits the BlitCopyBuffer kernel to copy sizeIn[0] bytes from src to dst.
//
//   dst, src      - raw destination/source addresses passed to the kernel as arguments 1 and 0
//   dstOrigin     - only component [0] is used: byte offset applied on the destination side
//   srcOrigin     - only component [0] is used: byte offset applied on the source side
//   sizeIn        - only component [0] is used: number of bytes to copy
//   entire        - not referenced by this function
//   blitWg        - upper bound on the number of workgroups used for the dispatch
//   copyMetadata  - not referenced by this function
//   attachSignal  - forwarded as the last argument of submitKernelInternal()
//
// Returns the result of gpu().submitKernelInternal().
bool KernelBlitManager::shaderCopyBuffer(address dst, address src,
                                         const amd::Coord3D& dstOrigin,
                                         const amd::Coord3D& srcOrigin,
                                         const amd::Coord3D& sizeIn, bool entire,
                                         const uint32_t blitWg,
                                         amd::CopyMetadata copyMetadata,
                                         bool attachSignal) const {
  constexpr uint32_t kBlitType = BlitCopyBuffer;
  // Widest element the copy may use: 16 bytes
  constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
  amd::Coord3D size(sizeIn[0]);

  // Check alignments for source and destination: both origins must be multiples of 16 bytes
  // to use the wide element, otherwise fall back to 4-byte elements
  bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
  uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);

  // Setup copy size accordingly to the alignment: convert the byte count into a count of
  // elements, rounding up when the byte count is not a multiple of the element size
  uint32_t remainder = size[0] % aligned_size;
  size.c[0] /= aligned_size;
  size.c[0] += (remainder != 0) ? 1 : 0;

  // Program the dispatch dimensions: at most blitWg workgroups, rounded up to a whole workgroup
  const size_t localWorkSize = (aligned) ? 512 : 1024;
  size_t globalWorkSize = std::min(blitWg * localWorkSize, size[0]);
  globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);

  // Program kernels arguments for the blit operation
  // Program source origin
  // NOTE(review): the trailing (offset, nullptr, true) arguments presumably mark a raw
  // pointer plus byte offset -- confirm against setArgument()
  setArgument(kernels_[kBlitType], 0, sizeof(src), reinterpret_cast<void*>(src),
              srcOrigin[0], nullptr, true);

  // Program destination origin
  setArgument(kernels_[kBlitType], 1, sizeof(dst), reinterpret_cast<void*>(dst),
              dstOrigin[0], nullptr, true);

  // Total copy size in bytes (not elements)
  uint64_t copySize = sizeIn[0];
  setArgument(kernels_[kBlitType], 2, sizeof(copySize), &copySize);

  setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder);
  setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size);

  // End pointer is the aligned copy size and destination offset
  uint64_t end_ptr = reinterpret_cast<uint64_t>(dst) + dstOrigin[0] + sizeIn[0] - remainder;

  setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr);

  // The kernel receives the global work size as its "next chunk" value
  uint32_t next_chunk = globalWorkSize;
  setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk);

  // Create ND range object for the kernel's execution
  amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);

  // Execute the blit
  address parameters = captureArguments(kernels_[kBlitType]);
  bool result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters, nullptr,
                                           0, nullptr, nullptr, attachSignal);
  releaseArguments(parameters);

  return result;
}
|
||||
|
||||
// ================================================================================================
|
||||
bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
@@ -2238,32 +2215,28 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
bool p2p = false;
|
||||
uint32_t blit_wg_ = dev().settings().limit_blit_wg_;
|
||||
uint32_t blitWg = dev().settings().limit_blit_wg_;
|
||||
|
||||
if (&gpuMem(srcMemory).dev() != &gpuMem(dstMemory).dev()) {
|
||||
if (sizeIn[0] > dev().settings().sdma_p2p_threshold_) {
|
||||
p2p = true;
|
||||
} else {
|
||||
constexpr uint32_t kLimitWgForKernelP2p = 16;
|
||||
blit_wg_ = kLimitWgForKernelP2p;
|
||||
blitWg = kLimitWgForKernelP2p;
|
||||
}
|
||||
}
|
||||
|
||||
bool asan = false;
|
||||
bool ipcShared = srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
|
||||
#if defined(__clang__)
|
||||
#if __has_feature(address_sanitizer)
|
||||
asan = true;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
|
||||
(sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
|
||||
(!(p2p || asan || ipcShared) &&
|
||||
(!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess() &&
|
||||
!(copyMetadata.copyEnginePreference_ ==
|
||||
amd::CopyMetadata::CopyEnginePreference::SDMA)) ||
|
||||
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT));
|
||||
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
|
||||
(sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
|
||||
(!(p2p || ipcShared) &&
|
||||
(!srcMemory.isHostMemDirectAccess()
|
||||
&& !dstMemory.isHostMemDirectAccess() &&
|
||||
!(copyMetadata.copyEnginePreference_ ==
|
||||
amd::CopyMetadata::CopyEnginePreference::SDMA)) ||
|
||||
(copyMetadata.copyEnginePreference_ ==
|
||||
amd::CopyMetadata::CopyEnginePreference::BLIT));
|
||||
|
||||
if (!useShaderCopyPath) {
|
||||
if (amd::IS_HIP) {
|
||||
@@ -2275,60 +2248,15 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
gpu().SetCopyCommandType(CL_COMMAND_READ_BUFFER);
|
||||
}
|
||||
}
|
||||
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire,
|
||||
copyMetadata);
|
||||
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn,
|
||||
entire, copyMetadata);
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
constexpr uint32_t kBlitType = BlitCopyBuffer;
|
||||
constexpr uint32_t kMaxAlignment = 2 * sizeof(uint64_t);
|
||||
amd::Coord3D size(sizeIn[0]);
|
||||
|
||||
// Check alignments for source and destination
|
||||
bool aligned = ((srcOrigin[0] % kMaxAlignment) == 0) && ((dstOrigin[0] % kMaxAlignment) == 0);
|
||||
uint32_t aligned_size = (aligned) ? kMaxAlignment : sizeof(uint32_t);
|
||||
|
||||
// Setup copy size accordingly to the alignment
|
||||
uint32_t remainder = size[0] % aligned_size;
|
||||
size.c[0] /= aligned_size;
|
||||
size.c[0] += (remainder != 0) ? 1 : 0;
|
||||
|
||||
// Program the dispatch dimensions
|
||||
const size_t localWorkSize = (aligned) ? 512 : 1024;
|
||||
size_t globalWorkSize = std::min(blit_wg_ * localWorkSize, size[0]);
|
||||
globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
|
||||
|
||||
// Program kernels arguments for the blit operation
|
||||
cl_mem mem = as_cl<amd::Memory>(srcMemory.owner());
|
||||
// Program source origin
|
||||
uint64_t srcOffset = srcOrigin[0];
|
||||
setArgument(kernels_[kBlitType], 0, sizeof(cl_mem), &mem, srcOffset, &srcMemory);
|
||||
mem = as_cl<amd::Memory>(dstMemory.owner());
|
||||
// Program destinaiton origin
|
||||
uint64_t dstOffset = dstOrigin[0];
|
||||
setArgument(kernels_[kBlitType], 1, sizeof(cl_mem), &mem, dstOffset, &dstMemory);
|
||||
|
||||
uint64_t copySize = sizeIn[0];
|
||||
setArgument(kernels_[kBlitType], 2, sizeof(copySize), ©Size);
|
||||
|
||||
setArgument(kernels_[kBlitType], 3, sizeof(remainder), &remainder);
|
||||
setArgument(kernels_[kBlitType], 4, sizeof(aligned_size), &aligned_size);
|
||||
|
||||
// End pointer is the aligned copy size and destination offset
|
||||
uint64_t end_ptr = dstMemory.virtualAddress() + dstOffset + sizeIn[0] - remainder;
|
||||
|
||||
setArgument(kernels_[kBlitType], 5, sizeof(end_ptr), &end_ptr);
|
||||
|
||||
uint32_t next_chunk = globalWorkSize;
|
||||
setArgument(kernels_[kBlitType], 6, sizeof(next_chunk), &next_chunk);
|
||||
|
||||
// Create ND range object for the kernel's execution
|
||||
amd::NDRangeContainer ndrange(1, nullptr, &globalWorkSize, &localWorkSize);
|
||||
|
||||
// Execute the blit
|
||||
address parameters = captureArguments(kernels_[kBlitType]);
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[kBlitType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
result = shaderCopyBuffer(reinterpret_cast<address>(dstMemory.virtualAddress()),
|
||||
reinterpret_cast<address>(srcMemory.virtualAddress()),
|
||||
dstOrigin, srcOrigin, sizeIn,
|
||||
entire, blitWg, copyMetadata);
|
||||
}
|
||||
|
||||
synchronize();
|
||||
|
||||
@@ -231,7 +231,11 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
//! taking into account the Hsail profile supported by Hsa Agent
|
||||
bool hsaCopy(const Memory& srcMemory, const Memory& dstMemory, const amd::Coord3D& srcOrigin,
|
||||
const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
|
||||
amd::CopyMetadata copyMetadata) const;
|
||||
amd::CopyMetadata& copyMetadata) const;
|
||||
|
||||
inline bool rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
|
||||
const_address src, hsa_agent_t& srcAgent, size_t size,
|
||||
amd::CopyMetadata& copyMetadata) const;
|
||||
|
||||
const size_t MinSizeForPinnedTransfer;
|
||||
bool completeOperation_; //!< DMA blit manager must complete operation
|
||||
@@ -248,33 +252,13 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
//! Disable operator=
|
||||
DmaBlitManager& operator=(const DmaBlitManager&);
|
||||
|
||||
//! Reads video memory, using a staged buffer
|
||||
bool readMemoryStaged(Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
Memory& xferBuf, //!< Staged buffer for read
|
||||
size_t origin, //!< Original offset in the source memory
|
||||
size_t& offset, //!< Offset for the current copy pointer
|
||||
size_t& totalSize, //!< Total size for copy region
|
||||
size_t xferSize //!< Transfer size
|
||||
) const;
|
||||
|
||||
//! Write into video memory, using a staged buffer
|
||||
bool writeMemoryStaged(const void* srcHost, //!< Source host memory
|
||||
Memory& dstMemory, //!< Destination memory object
|
||||
address staging, //!< Staged buffer for write
|
||||
size_t origin, //!< Original offset in the destination memory
|
||||
size_t& offset, //!< Offset for the current copy pointer
|
||||
size_t& totalSize, //!< Total size for the copy region
|
||||
size_t xferSize //!< Transfer size
|
||||
) const;
|
||||
|
||||
//! Assits in transferring data from Host to Local or vice versa
|
||||
//! taking into account the Hsail profile supported by Hsa Agent
|
||||
bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied
|
||||
address hostDst, //!< Destination buffer address for copying
|
||||
size_t size, //!< Size of data to copy in bytes
|
||||
address staging, //!< Staging resource
|
||||
bool hostToDev //!< True if data is copied from Host To Device
|
||||
bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied
|
||||
address hostDst, //!< Destination buffer address for copying
|
||||
size_t size, //!< Size of data to copy in bytes
|
||||
bool hostToDev, //!< True if data is copied from H2D
|
||||
amd::CopyMetadata& copyMetadata //!< Memory copy MetaData
|
||||
) const;
|
||||
|
||||
bool forceHostWaitFunc(size_t copy_size) const;
|
||||
@@ -583,6 +567,12 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal;
|
||||
}
|
||||
|
||||
//! Copies a buffer using the shader path
//!
//! KernelBlitManager::copyBuffer calls this with the virtual addresses of the
//! destination and source memory objects as dst/src, and with blitWg set to the
//! device's limit_blit_wg_ setting (or 16 for small peer-to-peer copies).
//! attachSignal defaults to false.
//! NOTE(review): the definition is not fully visible here; presumably dstOrigin[0]
//! and srcOrigin[0] are byte offsets applied to dst/src and blitWg caps the number
//! of workgroups launched - confirm against the implementation.
|
||||
bool shaderCopyBuffer(address dst, address src,
|
||||
const amd::Coord3D& dstOrigin, const amd::Coord3D& srcOrigin,
|
||||
const amd::Coord3D& size, bool entire, const uint32_t blitWg,
|
||||
amd::CopyMetadata copyMetadata, bool attachSignal = false) const;
|
||||
|
||||
//! Disable copy constructor
|
||||
KernelBlitManager(const KernelBlitManager&);
|
||||
|
||||
|
||||
@@ -1202,7 +1202,8 @@ bool Device::populateOCLDeviceConstants() {
|
||||
&info_.globalMemCacheLineSize_)) {
|
||||
return false;
|
||||
}
|
||||
assert(info_.globalMemCacheLineSize_ > 0);
|
||||
info_.globalMemCacheLineSize_ = (info_.globalMemCacheLineSize_ != 0) ?
|
||||
info_.globalMemCacheLineSize_ : 64;
|
||||
|
||||
uint32_t cachesize[4] = {0};
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
|
||||
@@ -57,8 +57,7 @@ bool LightningKernel::postLoad() {
|
||||
}
|
||||
kernargSegmentAlignment_ =
|
||||
amd::alignUp(std::max(kernargSegmentAlignment_, 128u),
|
||||
device().info().globalMemCacheLineSize_ > 0 ?
|
||||
device().info().globalMemCacheLineSize_ : 64);
|
||||
device().info().globalMemCacheLineSize_);
|
||||
|
||||
// Set the workgroup information for the kernel
|
||||
workGroupInfo_.availableLDSSize_ = device().info().localMemSizePerCU_;
|
||||
|
||||
@@ -497,7 +497,8 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
|
||||
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine,
|
||||
bool forceHostWait) {
|
||||
bool explicit_wait = false;
|
||||
// Reset all current waiting signals
|
||||
waiting_signals_.clear();
|
||||
@@ -545,8 +546,8 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
|
||||
const Settings& settings = gpu_.dev().settings();
|
||||
// Actively wait on CPU to avoid extra overheads of signal tracking on GPU.
|
||||
// For small copies set forced wait
|
||||
if (!WaitForSignal<true>(external_signals_[i]->signal_, false,
|
||||
external_signals_[i]->flags_.forceHostWait_)) {
|
||||
if (!WaitForSignal<true>(external_signals_[i]->signal_, false, forceHostWait ?
|
||||
external_signals_[i]->flags_.forceHostWait_ : false)) {
|
||||
if (settings.cpu_wait_for_signal_) {
|
||||
// Wait on CPU for completion if requested
|
||||
CpuWaitForSignal(external_signals_[i]);
|
||||
@@ -713,6 +714,9 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
|
||||
else {
|
||||
uint32_t index = desc.info_.arrayIndex_;
|
||||
mem = memories[index];
|
||||
const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN,
|
||||
"Arg%d: %s %s = ptr:%p", i, desc.typeName_.c_str(), desc.name_.c_str(), globalAddress);
|
||||
if (mem == nullptr) {
|
||||
//! This condition is for SVM fine-grain
|
||||
if (dev().isFineGrainedSystem(true)) {
|
||||
@@ -839,7 +843,7 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1
|
||||
// ================================================================================================
|
||||
template <typename AqlPacket>
|
||||
bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking) {
|
||||
AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, bool attach_signal) {
|
||||
const uint32_t queueSize = gpu_queue_->size;
|
||||
const uint32_t queueMask = queueSize - 1;
|
||||
const uint32_t sw_queue_size = queueMask;
|
||||
@@ -847,6 +851,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
// Check for queue full and wait if needed.
|
||||
uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
|
||||
|
||||
if (addSystemScope_) {
|
||||
header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
|
||||
HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
@@ -858,15 +863,15 @@ bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
|
||||
|
||||
if (fence_state_ == amd::Device::kCacheStateSystem &&
|
||||
expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
if (fence_state_ == amd::Device::kCacheStateSystem
|
||||
&& expected_fence_state == amd::Device::kCacheStateSystem) {
|
||||
header = dispatchPacketHeader_;
|
||||
fence_dirty_ = true;
|
||||
}
|
||||
|
||||
fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
|
||||
|
||||
if (timestamp_ != nullptr) {
|
||||
if (timestamp_ != nullptr || attach_signal) {
|
||||
// Get active signal for current dispatch if profiling is necessary
|
||||
packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
|
||||
|
||||
@@ -967,7 +972,7 @@ void VirtualGPU::dispatchBlockingWait() {
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
|
||||
uint16_t rest, bool blocking, bool capturing,
|
||||
const uint8_t* aqlPacket) {
|
||||
const uint8_t* aqlPacket, bool attach_signal) {
|
||||
if (capturing == true) {
|
||||
packet->header = header;
|
||||
packet->setup = rest;
|
||||
@@ -975,13 +980,13 @@ bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_
|
||||
return true;
|
||||
} else {
|
||||
dispatchBlockingWait();
|
||||
return dispatchGenericAqlPacket(packet, header, rest, blocking);
|
||||
return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
|
||||
}
|
||||
}
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::dispatchAqlPacket(
|
||||
hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
|
||||
return dispatchGenericAqlPacket(packet, header, rest, blocking);
|
||||
// Barrier-AND packet overload: a thin wrapper that forwards every argument,
// including attach_signal, unchanged to dispatchGenericAqlPacket and returns its result.
bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
|
||||
bool blocking, bool attach_signal) {
|
||||
return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
@@ -1074,10 +1079,9 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
|
||||
barrier_packet_.completion_signal = signal;
|
||||
}
|
||||
|
||||
// Reset fence_dirty_ and addSystemScope_ flag if we submit a barrier with system scopes
|
||||
// Reset fence_dirty_ flag if we submit a barrier with system scopes
|
||||
if (cache_state == amd::Device::kCacheStateSystem) {
|
||||
fence_dirty_ = false;
|
||||
addSystemScope_ = false;
|
||||
}
|
||||
|
||||
while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
|
||||
@@ -1424,7 +1428,7 @@ bool VirtualGPU::ManagedBuffer::Create() {
|
||||
|
||||
// ================================================================================================
|
||||
address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
|
||||
auto alignment = gpu_.dev().info().globalMemCacheLineSize_;
|
||||
auto alignment = amd::alignUp(256u, gpu_.dev().info().globalMemCacheLineSize_);
|
||||
address result = nullptr;
|
||||
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
|
||||
const size_t pool_new_usage = (result + size) - pool_base_;
|
||||
@@ -1713,7 +1717,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
|
||||
bool imageBuffer = false;
|
||||
|
||||
// Force buffer read for IMAGE1D_BUFFER
|
||||
if ((type == CL_COMMAND_READ_IMAGE) && (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
|
||||
if ((type == CL_COMMAND_READ_IMAGE) &&
|
||||
(cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
|
||||
type = CL_COMMAND_READ_BUFFER;
|
||||
imageBuffer = true;
|
||||
}
|
||||
@@ -1732,7 +1737,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
|
||||
result = blitMgr().copyBuffer(*devMem, *hostMemory, origin, dstOrigin, size,
|
||||
cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
} else {
|
||||
result = blitMgr().readBuffer(*devMem, dst, origin, size, cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
result = blitMgr().readBuffer(*devMem, dst, origin, size,
|
||||
cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -1752,7 +1758,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_READ_IMAGE: {
|
||||
if ((cmd.source().parent() != nullptr) && (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
|
||||
if ((cmd.source().parent() != nullptr) &&
|
||||
(cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
|
||||
Image* imageBuffer = static_cast<Image*>(devMem);
|
||||
// Check if synchronization has to be performed
|
||||
if (nullptr != imageBuffer->CopyImageBuffer()) {
|
||||
@@ -1772,7 +1779,8 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
|
||||
amd::Coord3D dstOrigin(offset);
|
||||
result =
|
||||
blitMgr().copyImageToBuffer(*devMem, *hostMemory, cmd.origin(), dstOrigin, size,
|
||||
cmd.isEntireMemory(), cmd.rowPitch(), cmd.slicePitch(), cmd.copyMetadata());
|
||||
cmd.isEntireMemory(), cmd.rowPitch(),
|
||||
cmd.slicePitch(), cmd.copyMetadata());
|
||||
} else {
|
||||
result = blitMgr().readImage(*devMem, dst, cmd.origin(), size, cmd.rowPitch(),
|
||||
cmd.slicePitch(), cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
@@ -1839,7 +1847,8 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
|
||||
result = blitMgr().copyBuffer(*hostMemory, *devMem, srcOrigin, origin, size,
|
||||
cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
} else {
|
||||
result = blitMgr().writeBuffer(src, *devMem, origin, size, cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
result = blitMgr().writeBuffer(src, *devMem, origin, size,
|
||||
cmd.isEntireMemory(), cmd.copyMetadata());
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -3131,9 +3140,9 @@ void VirtualGPU::HiddenHeapInit() { const_cast<Device&>(dev()).HiddenHeapInit(*t
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
|
||||
const amd::Kernel& kernel, const_address parameters, void* event_handle,
|
||||
uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd,
|
||||
hsa_kernel_dispatch_packet_t* aql_packet) {
|
||||
hsa_kernel_dispatch_packet_t* aql_packet, bool attach_signal) {
|
||||
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
|
||||
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
|
||||
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
|
||||
@@ -3482,7 +3491,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
addSystemScope_ = true;
|
||||
}
|
||||
|
||||
|
||||
// Copy scheduler's AQL packet for possible relaunch from the scheduler itself
|
||||
if (aql_packet != nullptr) {
|
||||
*aql_packet = dispatchPacket;
|
||||
@@ -3504,7 +3512,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
} else {
|
||||
if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
|
||||
(sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
|
||||
GPU_FLUSH_ON_EXECUTION)) {
|
||||
GPU_FLUSH_ON_EXECUTION, false, nullptr, attach_signal)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "rocprintf.hpp"
|
||||
#include "hsa/hsa_ven_amd_aqlprofile.h"
|
||||
#include "rocsched.hpp"
|
||||
#include "device/device.hpp"
|
||||
|
||||
namespace amd::roc {
|
||||
class Device;
|
||||
@@ -270,7 +271,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
HwQueueEngine GetActiveEngine() const { return engine_; }
|
||||
|
||||
//! Returns the last submitted signal for a wait
|
||||
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
|
||||
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute,
|
||||
bool forceHostWait = true);
|
||||
|
||||
//! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
|
||||
void ResetCurrentSignal();
|
||||
@@ -341,8 +343,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void* event_handle, //!< Handle to OCL event for debugging
|
||||
uint32_t sharedMemBytes = 0, //!< Shared memory size
|
||||
amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command
|
||||
hsa_kernel_dispatch_packet_t* aql_packet = nullptr //!< Scheduler launch
|
||||
);
|
||||
hsa_kernel_dispatch_packet_t* aql_packet = nullptr, //!< Scheduler launch
|
||||
bool attach_signal = false);
|
||||
void submitNativeFn(amd::NativeFnCommand& cmd);
|
||||
void submitMarker(amd::Marker& cmd);
|
||||
void submitAccumulate(amd::AccumulateCommand& cmd);
|
||||
@@ -420,7 +422,10 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
void hasPendingDispatch() { hasPendingDispatch_ = true; }
|
||||
bool IsPendingDispatch() const { return (hasPendingDispatch_) ? true : false; }
|
||||
void addSystemScope() { addSystemScope_ = true; }
|
||||
//! Sets the addSystemScope_ flag and resets the cached fence_state_ to kCacheStateInvalid.
//! NOTE(review): dispatchGenericAqlPacket only swaps in dispatchPacketHeader_ when
//! fence_state_ equals kCacheStateSystem, so invalidating it here presumably makes the
//! next dispatch keep its requested fence scopes - confirm.
void addSystemScope() {
|
||||
addSystemScope_ = true;
|
||||
fence_state_ = amd::Device::CacheState::kCacheStateInvalid;
|
||||
}
|
||||
void SetCopyCommandType(cl_command_type type) { copy_command_type_ = type; }
|
||||
|
||||
HwQueueTracker& Barriers() { return barriers_; }
|
||||
@@ -444,11 +449,12 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
amd::AccumulateCommand* vcmd = nullptr);
|
||||
bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest,
|
||||
bool blocking = true, bool capturing = false,
|
||||
const uint8_t* aqlPacket = nullptr);
|
||||
const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
|
||||
bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header,
|
||||
uint16_t rest, bool blocking = true);
|
||||
uint16_t rest, bool blocking = true, bool attach_signal = false);
|
||||
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
|
||||
uint16_t rest, bool blocking);
|
||||
uint16_t rest, bool blocking,
|
||||
bool attach_signal = false);
|
||||
|
||||
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
|
||||
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
|
||||
|
||||
@@ -85,7 +85,7 @@ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \
|
||||
release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \
|
||||
"The resource cache size in MB") \
|
||||
release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
|
||||
"The maximum size accepted for suballocaitons in KB") \
|
||||
"The maximum size accepted for suballocations in KB") \
|
||||
release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \
|
||||
"Number of memory objects for dependency tracking") \
|
||||
release(size_t, GPU_XFER_BUFFER_SIZE, 0, \
|
||||
@@ -105,7 +105,7 @@ release(bool, GPU_USE_DEVICE_QUEUE, false, \
|
||||
release(bool, AMD_THREAD_TRACE_ENABLE, true, \
|
||||
"Enable thread trace extension") \
|
||||
release(uint, OPENCL_VERSION, 200, \
|
||||
"Force GPU opencl verison") \
|
||||
"Force GPU opencl version") \
|
||||
release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \
|
||||
"Enable HSA device local memory usage") \
|
||||
release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \
|
||||
@@ -186,7 +186,7 @@ release(bool, AMD_DIRECT_DISPATCH, false, \
|
||||
release(uint, HIP_HIDDEN_FREE_MEM, 0, \
|
||||
"Reserve free mem reporting in Mb" \
|
||||
"0 = Disable") \
|
||||
release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 0, \
|
||||
release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 16, \
|
||||
"Use Blit until this size(in KB) for copies") \
|
||||
release(uint, ROC_ACTIVE_WAIT_TIMEOUT, 0, \
|
||||
"Forces active wait of GPU interrup for the timeout(us)") \
|
||||
|
||||
Ссылка в новой задаче
Block a user