Add L2 flush/invalidate after CPU copy

CPU read updates L2 with the latest values and requires
invalidation after, because SDMA doesn't use L2 and data can become
out of sync.

Change-Id: I98d1c91ca78a103fa5409e638f97485d62d5b11e


[ROCm/clr commit: 18a821acde]
Tento commit je obsažen v:
German Andryeyev
2020-12-10 00:16:23 -05:00
rodič c2637f1589
revize f7cf40fc02
+10 -3
Zobrazit soubor
@@ -1654,14 +1654,21 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
bool result = false;
if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
if ((srcMemory.owner()->getHostMem() == nullptr) &&
(srcMemory.owner()->getSvmPtr() != nullptr)) {
// CPU read ahead, hence release GPU memory and force barrier to make sure L2 flush
constexpr bool ForceBarrier = true;
gpu().releaseGpuMemoryFence(ForceBarrier);
char* src = reinterpret_cast<char*>(srcMemory.owner()->getSvmPtr());
std::memcpy(dstHost, src + origin[0], size[0]);
// The first dispatch will invalidate L2
gpu().addSystemScope();
// Force L2 invalidation/flush, because CPU read goes through L2, but SDMA(Gfx9) doesn't.
// The sequence below can produce incorrect result without explicit flush:
// 1. H->D: SDMA
// 2. D->H: CPU Read L2 updated
// 3. H->D: SDMA Memory updated, SDMA doesn't use L2
// 4. D->H: CPU Read L2 flush above (releaseGpuMemoryFence()) corrupts memory with stale
// data from step 2 and then CPU reads invalid data
gpu().hasPendingDispatch();
return true;
}
}